Fix File
•
/
home
/
sportsfe...
/
httpdocs
/
wp-conte...
/
plugins
/
accelera...
/
includes
/
vendor
/
tool
/
Dom
•
File:
Document.php
•
Content:
<?php namespace AmpProject\Dom; use AmpProject\Amp; use AmpProject\Attribute; use AmpProject\DevMode; use AmpProject\Dom\Document\Encoding; use AmpProject\Dom\Document\Option; use AmpProject\Exception\FailedToRetrieveRequiredDomElement; use AmpProject\Exception\MaxCssByteCountExceeded; use AmpProject\Optimizer\CssRule; use AmpProject\Tag; use DOMAttr; use DOMComment; use DOMDocument; use DOMElement; use DOMNode; use DOMNodeList; use DOMText; use DOMXPath; /** * Abstract away some of the difficulties of working with PHP's DOMDocument. * * @property DOMXPath $xpath XPath query object for this document. * @property Element $html The document's <html> element. * @property Element $head The document's <head> element. * @property Element $body The document's <body> element. * @property Element|null $viewport The document's viewport meta element. * @property DOMNodeList $ampElements The document's <amp-*> elements. * @property Element $ampCustomStyle The document's <style amp-custom> element. * @property int $ampCustomStyleByteCount Count of bytes of the CSS in the <style amp-custom> tag. * @property int $inlineStyleByteCount Count of bytes of the CSS in all of the inline style attributes. * * @package ampproject/amp-toolbox */ final class Document extends DOMDocument { /** * Default document type to use. * * @var string */ const DEFAULT_DOCTYPE = '<!DOCTYPE html>'; /** * Regular expression to match the HTML doctype. * * @var string */ const HTML_DOCTYPE_REGEX_PATTERN = '#<!doctype\s+html[^>]+?>#si'; /** * Attribute prefix for AMP-bind data attributes. * * @var string */ const AMP_BIND_DATA_ATTR_PREFIX = 'data-amp-bind-'; /** * Pattern for HTML attribute accounting for binding attr name in square brackets syntax, boolean attribute, * single/double-quoted attribute value, and unquoted attribute values. * * @var string */ const AMP_BIND_SQUARE_BRACKETS_ATTR_PATTERN = '#^\s+(?P<name>\[?[a-zA-Z0-9_\-]+\]?)' . 
'(?P<value>=(?>"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#'; /** * Pattern for HTML attribute accounting for binding attr name in data attribute syntax, boolean attribute, * single/double-quoted attribute value, and unquoted attribute values. * * @var string */ const AMP_BIND_DATA_ATTRIBUTE_ATTR_PATTERN = '#^\s+(?P<name>(?:' . self::AMP_BIND_DATA_ATTR_PREFIX . ')?[a-zA-Z0-9_\-]+)' . '(?P<value>=(?>"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#'; /** * Match all start tags that contain a binding attribute in square brackets syntax. * * @var string */ const AMP_BIND_SQUARE_START_PATTERN = '#<' . '(?P<name>[a-zA-Z0-9_\-]+)' // Tag name. . '(?P<attrs>\s+' // Attributes. . '(?>[^>"\'\[\]]+|"[^"]*+"|\'[^\']*+\')*+' // Non-binding attributes tokens. . '\[[a-zA-Z0-9_\-]+\]' // One binding attribute key. . '(?>[^>"\']+|"[^"]*+"|\'[^\']*+\')*+' // Any attribute tokens, including // binding ones. . ')>#s'; /** * Match all start tags that contain a binding attribute in data attribute syntax. * * @var string */ const AMP_BIND_DATA_START_PATTERN = '#<' . '(?P<name>[a-zA-Z0-9_\-]+)' // Tag name. . '(?P<attrs>\s+' // Attributes. . '(?>' // Match at least one attribute . '(?>' // prefixed with "data-amp-bind-". . '(?![a-zA-Z0-9_\-\s]*' . self::AMP_BIND_DATA_ATTR_PREFIX . '[a-zA-Z0-9_\-]+="[^"]*+"|\'[^\']*+\')' . '[^>"\']+|"[^"]*+"|\'[^\']*+\'' . ')*+' . '(?>[a-zA-Z0-9_\-\s]*' . self::AMP_BIND_DATA_ATTR_PREFIX . '[a-zA-Z0-9_\-]+' . ')' . ')+' . '(?>[^>"\']+|"[^"]*+"|\'[^\']*+\')*+' // Any attribute tokens, including // binding ones. . ')>#is'; /* * Regular expressions to fetch the individual structural tags. * These patterns were optimized to avoid extreme backtracking on large documents. 
*/ const HTML_STRUCTURE_DOCTYPE_PATTERN = '/^(?<doctype>[^<]*(?>\s*<!--.*?-->\s*)*<!doctype(?>\s+[^>]+)?>)/is'; const HTML_STRUCTURE_HTML_START_TAG = '/^(?<html_start>[^<]*(?>\s*<!--.*?-->\s*)*<html(?>\s+[^>]*)?>)/is'; const HTML_STRUCTURE_HTML_END_TAG = '/(?<html_end><\/html(?>\s+[^>]*)?>.*)$/is'; const HTML_STRUCTURE_HEAD_START_TAG = '/^[^<]*(?><!--.*?-->\s*)*(?><head(?>\s+[^>]*)?>)/is'; const HTML_STRUCTURE_BODY_START_TAG = '/^[^<]*(?><!--.*-->\s*)*(?><body(?>\s+[^>]*)?>)/is'; const HTML_STRUCTURE_BODY_END_TAG = '/(?><\/body(?>\s+[^>]*)?>.*)$/is'; const HTML_STRUCTURE_HEAD_TAG = '/^(?>[^<]*(?><head(?>\s+[^>]*)?>).*?<\/head(?>\s+[^>]*)?>)/is'; // Regex patterns used for securing and restoring the doctype node. const HTML_SECURE_DOCTYPE_IF_NOT_FIRST_PATTERN = '/(^[^<]*(?>\s*<!--[^>]*>\s*)+<)(!)(doctype)(\s+[^>]+?)(>)/i'; const HTML_RESTORE_DOCTYPE_PATTERN = '/(^[^<]*(?>\s*<!--[^>]*>\s*)+<)' . '(!--amp-)(doctype)(\s+[^>]+?)(-->)/i'; // Regex pattern used for removing Internet Explorer conditional comments. const HTML_IE_CONDITIONAL_COMMENTS_PATTERN = '/<!--(?>\[if\s|<!\[endif)(?>[^>]+(?<!--)>)*(?>[^>]+(?<=--)>)/i'; /** * Xpath query to fetch the attributes that are being URL-encoded by saveHTML(). * * @var string */ const XPATH_URL_ENCODED_ATTRIBUTES_QUERY = './/*/@src|.//*/@href|.//*/@action'; /** * Xpath query to fetch the elements containing Mustache templates (both <template type=amp-mustache> and * <script type=text/plain template=amp-mustache>). * * @var string */ const XPATH_MUSTACHE_TEMPLATE_ELEMENTS_QUERY = './/self::template[ @type = "amp-mustache" ]' . '|//self::script[ @type = "text/plain" ' . 'and @template = "amp-mustache" ]'; /** * Error message to use when the __get() is triggered for an unknown property. * * @var string */ const PROPERTY_GETTER_ERROR_MESSAGE = 'Undefined property: AmpProject\\Dom\\Document::'; /** * Charset compatibility tag for making DOMDocument behave. * * See: http://php.net/manual/en/domdocument.loadhtml.php#78243. 
* * @var string */ const HTTP_EQUIV_META_TAG = '<meta http-equiv="content-type" content="text/html; charset=utf-8">'; // Regex patterns and values used for adding and removing http-equiv charsets for compatibility. // The opening tag pattern contains a comment to make sure we don't match a <head> tag within a comment. const HTML_GET_HEAD_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<head(?>\s+[^>]*)?>/is'; const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG; const HTML_GET_HTML_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<html(?>\s+[^>]*)?>/is'; const HTML_GET_HTML_OPENING_TAG_REPLACEMENT = '$0<head>' . self::HTTP_EQUIV_META_TAG . '</head>'; const HTML_GET_HTTP_EQUIV_TAG_PATTERN = '#<meta http-equiv=([\'"])content-type\1 ' . 'content=([\'"])text/html; ' . 'charset=utf-8\2>#i'; const HTML_HTTP_EQUIV_VALUE = 'content-type'; const HTML_HTTP_EQUIV_CONTENT_VALUE = 'text/html; charset=utf-8'; // Regex patterns used for finding tags or extracting attribute values in an HTML string. const HTML_FIND_TAG_WITHOUT_ATTRIBUTE_PATTERN = '/<%1$s[^>]*?>[^<]*(?><\/%1$s>)?/i'; const HTML_FIND_TAG_WITH_ATTRIBUTE_PATTERN = '/<%1$s [^>]*?\s*%2$s\s*=[^>]*?>[^<]*(?><\/%1$s>)?/i'; const HTML_EXTRACT_ATTRIBUTE_VALUE_PATTERN = '/%s=(?>([\'"])(?<full>.*)?\1|(?<partial>[^ \'";]+))/'; const HTML_FIND_TAG_DELIMITER = '/'; /** * Pattern to match an AMP emoji together with its variant (amp4ads, amp4email, ...). * * @var string */ const AMP_EMOJI_ATTRIBUTE_PATTERN = '/<html\s([^>]*?(?:' . Attribute::AMP_EMOJI_ALT . '|' . Attribute::AMP_EMOJI . ')(4(?:ads|email))?[^>]*?)>/i'; // Attribute to use as a placeholder to move the emoji AMP symbol (⚡) over to DOM. const EMOJI_AMP_ATTRIBUTE_PLACEHOLDER = 'emoji-amp'; // Patterns used for fixing the mangled encoding of src attributes with SVG data. const I_AMPHTML_SIZER_REGEX_PATTERN = '/(?<before_src><i-amphtml-sizer\s+[^>]*>\s*<img\s+[^>]*?\s+src=([\'"]))' . '(?<src>.*?)' . 
'(?<after_src>\2><\/i-amphtml-sizer>)/i'; const SRC_SVG_REGEX_PATTERN = '/^\s*(?<type>[^<]+)(?<value><svg[^>]+>)\s*$/i'; /** * XPath query to retrieve all <amp-*> tags, relative to the <body> node. * * @var string */ const XPATH_AMP_ELEMENTS_QUERY = ".//*[starts-with(name(), 'amp-')]"; /** * XPath query to retrieve the <style amp-custom> tag, relative to the <head> node. * * @var string */ const XPATH_AMP_CUSTOM_STYLE_QUERY = './/style[@amp-custom]'; /** * XPath query to fetch the inline style attributes, relative to the <body> node. * * @var string */ const XPATH_INLINE_STYLE_ATTRIBUTES_QUERY = './/@style'; /** * Associative array of options to configure the behavior of the DOM document abstraction. * * @see Option::DEFAULTS For a list of available options. * * @var array */ private $options; /** * Whether `data-ampdevmode` was initially set on the the document element. * * @var bool */ private $hasInitialAmpDevMode = false; /** * The original encoding of how the AmpProject\Dom\Document was created. * * This is stored to do an automatic conversion to UTF-8, which is * a requirement for AMP. * * @var string */ private $originalEncoding; /** * Store the <noscript> markup that was extracted to preserve it during parsing. * * The array keys are the element IDs for placeholder <meta> tags. * * @see maybeReplaceNoscriptElements() * @see maybeRestoreNoscriptElements() * * @var string[] */ private $noscriptPlaceholderComments = []; /** * Store whether mustache template tags were replaced and need to be restored. * * @see replaceMustacheTemplateTokens() * * @var bool */ private $mustacheTagsReplaced = false; /** * Whether we had secured a doctype that needs restoring or not. * * This is an int as it receives the $count from the preg_replace(). * * @var int */ private $securedDoctype = 0; /** * Whether the self-closing tags were transformed and need to be restored. * * This avoids duplicating this effort (maybe corrupting the DOM) on multiple calls to saveHTML(). 
*
     * @var bool
     */
    private $selfClosingTagsTransformed = false;

    /**
     * Emoji that was used to represent the AMP attribute in the markup.
     *
     * Several variations exist, so the one that was encountered is kept track of here.
     *
     * @see https://github.com/ampproject/amphtml/issues/25990
     *
     * @var string
     */
    private $usedAmpEmoji;

    /**
     * Current index, stored per prefix.
     *
     * Used to generate IDs that are unique per prefix.
     *
     * @var int[]
     */
    private $indexCounter = [];

    /**
     * Maximum number of bytes of CSS that is enforced.
     *
     * A negative number disables the byte count limit.
     *
     * @var int
     */
    private $cssMaxByteCountEnforced = -1;

    /**
     * Names of the amp-bind attributes that were converted, stored so that they can be restored later on.
     *
     * @var array<string>
     */
    private $convertedAmpBindAttributes = [];

    /**
     * Creates a new AmpProject\Dom\Document object.
     *
     * The requested encoding is only recorded as the original encoding (Encoding::UNKNOWN when empty);
     * the underlying DOMDocument itself is always constructed with Encoding::AMP.
     *
     * @link https://php.net/manual/domdocument.construct.php
     *
     * @param string $version  Optional. The version number of the document as part of the XML declaration.
     *                         Falls back to '1.0' when empty.
     * @param string $encoding Optional. The encoding of the document as part of the XML declaration.
     */
    public function __construct($version = '', $encoding = null)
    {
        $requestedEncoding      = (string) $encoding;
        $this->originalEncoding = $requestedEncoding ?: Encoding::UNKNOWN;

        parent::__construct($version ?: '1.0', Encoding::AMP);

        // Make the document hand out the project's Element class instead of plain DOMElement objects.
        $this->registerNodeClass(DOMElement::class, Element::class);

        $this->options = Option::DEFAULTS;
    }

    /**
     * Named constructor that turns a full HTML string into a DOM.
     *
     * @param string       $html    HTML to turn into a DOM.
     * @param array|string $options Optional. Array of options to configure the document. Used as encoding if a string
     *                              is passed. Defaults to an empty array.
     * @return Document|false DOM generated from provided HTML, or false if the transformation failed.
     */
    public static function fromHtml($html, $options = [])
    {
        // A plain string is interpreted as the encoding, for BC reasons.
        if (is_string($options)) {
            $options = [Option::ENCODING => $options];
        }

        $encoding = null;
        if (isset($options[Option::ENCODING])) {
            $encoding = $options[Option::ENCODING];
        }

        $document = new self('', $encoding);

        return $document->loadHTML($html, $options) ? $document : false;
    }

    /**
     * Named constructor that turns an HTML fragment into a DOM.
     *
     * Unlike Document::fromHtml(), the structure of a fragment is not normalized before parsing.
     *
     * @param string       $html    HTML to turn into a DOM.
     * @param array|string $options Optional. Array of options to configure the document. Used as encoding if a string
     *                              is passed. Defaults to an empty array.
     * @return Document|false DOM generated from provided HTML, or false if the transformation failed.
     */
    public static function fromHtmlFragment($html, $options = [])
    {
        // A plain string is interpreted as the encoding, for BC reasons.
        if (is_string($options)) {
            $options = [Option::ENCODING => $options];
        }

        $encoding = null;
        if (isset($options[Option::ENCODING])) {
            $encoding = $options[Option::ENCODING];
        }

        $document = new self('', $encoding);

        return $document->loadHTMLFragment($html, $options) ? $document : false;
    }

    /**
     * Named constructor that retrieves the DOM a node belongs to.
     *
     * When the node's root is already an instance of this class, that instance is returned as-is. Otherwise a
     * new document is created and the node is imported into it.
     *
     * @param DOMNode $node Node to retrieve the DOM from. This is being modified by reference (!): when a new
     *                      document has to be created, $node is replaced by its imported copy.
     * @return Document Document the node belongs to, or the new document the node was imported into.
     */
    public static function fromNode(DOMNode &$node)
    {
        // A node without an owner document is the document itself.
        $owner = $node->ownerDocument;
        $root  = null === $owner ? $node : $owner;

        if ($root instanceof self) {
            return $root;
        }

        $document = new self();

        // The $node is replaced by reference so that the caller keeps working with a node that lives in the
        // new document. Otherwise the returned document and $node would refer to two different DOMDocuments.
        $node = $document->importNode($node, true);
        $document->appendChild($node);

        $document->hasInitialAmpDevMode = $document->documentElement->hasAttribute(DevMode::DEV_MODE_ATTRIBUTE);

        return $document;
    }

    /**
     * Reset the internal optimizations of the Document object.
     *
     * Drops the cached xpath, head and body properties. This might be needed after an operation that causes
     * the cached nodes and XPath objects to point to the wrong document.
     *
     * @return self Reset version of the Document object.
     */
    private function reset()
    {
        // Drop references to old DOM document.
        unset($this->xpath, $this->head, $this->body);

        // The document reference itself stays the same for now, but might need to change in the future.
        return $this;
    }

    /**
     * Load HTML from a string.
     *
     * The source is structurally normalized first, then loaded as a fragment; on success some additional
     * clean-up of the resulting DOM is performed.
     *
     * @link https://php.net/manual/domdocument.loadhtml.php
     *
     * @param string           $source  The HTML string.
     * @param array|int|string $options Optional. Array of options to configure the document. Used as additional Libxml
     *                                  parameters if an int or string is passed. Defaults to an empty array.
     * @return bool true on success or false on failure.
     */
    public function loadHTML($source, $options = [])
    {
        $normalizedSource = $this->normalizeDocumentStructure($source);
        $success          = $this->loadHTMLFragment($normalizedSource, $options);

        if (! $success) {
            return $success;
        }

        $this->insertMissingCharset();

        // Do some further clean-up.
        $this->deduplicateTag(Tag::HEAD);
        $this->deduplicateTag(Tag::BODY);
        $this->moveInvalidHeadNodesToBody();
        $this->movePostBodyNodesToBody();
        $this->convertHeadProfileToLink();

        return $success;
    }

    /**
     * Load a HTML fragment from a string.
     *
     * @param string           $source  The HTML fragment string.
     * @param array|int|string $options Optional. Array of options to configure the document. Used as additional Libxml
     *                                  parameters if an int or string is passed. Defaults to an empty array.
     * @return bool true on success or false on failure.
     */
    public function loadHTMLFragment($source, $options = [])
    {
        // Assume options are the additional libxml flags if a string or int is passed, for BC reasons.
if (is_string($options)) {
            $options = (int) $options;
        }

        if (is_int($options)) {
            $options = [Option::LIBXML_FLAGS => $options];
        }

        $this->options = array_merge($this->options, $options);

        $this->reset();

        // Pre-process the markup so that libxml can parse constructs it would otherwise mangle or drop.
        // The order of these steps is significant and mirrors the restore steps done when saving.
        $source = $this->convertAmpEmojiAttribute($source);
        $source = $this->convertAmpBindAttributes($source);
        $source = $this->replaceSelfClosingTags($source);
        $source = $this->maybeReplaceNoscriptElements($source);
        $source = $this->secureMustacheScriptTemplates($source);
        $source = $this->secureDoctypeNode($source);

        list($source, $this->originalEncoding) = $this->detectAndStripEncoding($source);

        if (Encoding::AMP !== strtolower($this->originalEncoding)) {
            $source = $this->adaptEncoding($source);
        }

        $source = $this->addHttpEquivCharset($source);

        $libxml_previous_state = libxml_use_internal_errors(true);

        try {
            $this->options[Option::LIBXML_FLAGS] |= LIBXML_COMPACT;

            /*
             * LIBXML_HTML_NODEFDTD is only available for libxml 2.7.8+.
             * This should be the case for PHP 5.4+, but some systems seem to compile against a custom libxml
             * version that is lower than expected.
             */
            if (defined('LIBXML_HTML_NODEFDTD')) {
                $this->options[Option::LIBXML_FLAGS] |= constant('LIBXML_HTML_NODEFDTD');
            }

            $success = parent::loadHTML($source, $this->options[Option::LIBXML_FLAGS]);
        } finally {
            // Always hand the libxml error handling back in its previous state, even if loading throws,
            // so that a failure here cannot leave error reporting silenced for unrelated code.
            libxml_clear_errors();
            libxml_use_internal_errors($libxml_previous_state);
        }

        if (! $success) {
            return $success;
        }

        $this->normalizeHtmlAttributes();
        $this->restoreMustacheScriptTemplates();
        $this->maybeRestoreNoscriptElements();

        // Remove http-equiv charset again.
        $meta = $this->head->firstChild;

        // We might have leading comments we need to preserve here.
        while ($meta instanceof DOMComment) {
            $meta = $meta->nextSibling;
        }

        // Only remove the element if it really is the compatibility tag that was injected before parsing.
        if (
            $meta instanceof Element
            && Tag::META === $meta->tagName
            && self::HTML_HTTP_EQUIV_VALUE === $meta->getAttribute(Attribute::HTTP_EQUIV)
            && self::HTML_HTTP_EQUIV_CONTENT_VALUE === $meta->getAttribute(Attribute::CONTENT)
        ) {
            $this->head->removeChild($meta);
        }

        $this->hasInitialAmpDevMode = $this->documentElement->hasAttribute(DevMode::DEV_MODE_ATTRIBUTE);

        return $success;
    }

    /**
     * Dumps the internal document into a string using HTML formatting.
     *
     * Delegates to saveHTMLFragment().
     *
     * @link https://php.net/manual/domdocument.savehtml.php
     *
     * @param DOMNode|null $node Optional. Parameter to output a subset of the document.
     * @return string The HTML, or false if an error occurred.
     */
    public function saveHTML(DOMNode $node = null)
    {
        return $this->saveHTMLFragment($node);
    }

    /**
     * Dumps the internal document fragment into a string using HTML formatting.
     *
     * @param DOMNode|null $node Optional. Parameter to output a subset of the document.
     * @return string The HTML fragment, or false if an error occurred.
     */
    public function saveHTMLFragment(DOMNode $node = null)
    {
        $this->replaceMustacheTemplateTokens();

        // Force-add http-equiv charset to make DOMDocument behave as it should.
        // See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
        $charset = $this->createElement(Tag::META);
        $charset->setAttribute(Attribute::HTTP_EQUIV, self::HTML_HTTP_EQUIV_VALUE);
        $charset->setAttribute(Attribute::CONTENT, self::HTML_HTTP_EQUIV_CONTENT_VALUE);
        $this->head->insertBefore($charset, $this->head->firstChild);

        // Before PHP 7.3, serializing a single node adds unexpected formatting whitespace,
        // so the node is extracted from the fully serialized document instead.
        if (null === $node || PHP_VERSION_ID >= 70300) {
            $html = parent::saveHTML($node);
        } else {
            $html = $this->extractNodeViaFragmentBoundaries($node);
        }

        // Remove http-equiv charset again.
        // It is also removed from the DOM again in case saveHTML() is used multiple times.
$this->head->removeChild($charset);

        $html = preg_replace(self::HTML_GET_HTTP_EQUIV_TAG_PATTERN, '', $html, 1);

        // Undo the pre-processing that was applied to the markup before it was parsed.
        $html = $this->restoreDoctypeNode($html);
        $html = $this->restoreMustacheTemplateTokens($html);
        $html = $this->restoreSelfClosingTags($html);
        $html = $this->restoreAmpBindAttributes($html);
        $html = $this->restoreAmpEmojiAttribute($html);
        $html = $this->fixSvgSourceAttributeEncoding($html);

        // Whitespace just causes unit tests to fail... so whitespace begone.
        if ('' === trim($html)) {
            return '';
        }

        return $html;
    }

    /**
     * Get the current options of the Document instance.
     *
     * @return array
     */
    public function getOptions()
    {
        return $this->options;
    }

    /**
     * Add the required utf-8 meta charset tag if it is still missing.
     *
     * The tag is inserted as the first child of the <head>.
     */
    private function insertMissingCharset()
    {
        // Bail if a charset tag is already present.
        if ($this->xpath->query('.//meta[ @charset ]')->item(0)) {
            return;
        }

        $charset = $this->createElement(Tag::META);
        $charset->setAttribute(Attribute::CHARSET, Encoding::AMP);
        $this->head->insertBefore($charset, $this->head->firstChild);
    }

    /**
     * Extract a node's HTML via fragment boundaries.
     *
     * Temporarily adds fragment boundary comments in order to locate the desired node to extract from
     * the given HTML document. This is required because libxml seems to only preserve whitespace when
     * serializing when calling DOMDocument::saveHTML() on the entire document. If you pass the element
     * to DOMDocument::saveHTML() then formatting whitespace gets added unexpectedly. This is seen to
     * be fixed in PHP 7.3, but for older versions of PHP the following workaround is needed.
     *
     * @param DOMNode $node Node to extract the HTML for.
     * @return string Extracted HTML string.
     */
    private function extractNodeViaFragmentBoundaries(DOMNode $node)
    {
        $boundary      = $this->getUniqueId('fragment_boundary');
        $startBoundary = $boundary . ':start';
        $endBoundary   = $boundary . ':end';
        $commentStart  = $this->createComment($startBoundary);
        $commentEnd    = $this->createComment($endBoundary);

        // Wrap the node in a pair of marker comments.
        $node->parentNode->insertBefore($commentStart, $node);
        $node->parentNode->insertBefore($commentEnd, $node->nextSibling);

        // Keep only what was serialized between the two marker comments.
        $pattern = '/^.*?'
                   . preg_quote("<!--{$startBoundary}-->", '/')
                   . '(.*)'
                   . preg_quote("<!--{$endBoundary}-->", '/')
                   . '.*?\s*$/s';

        $html = preg_replace($pattern, '$1', parent::saveHTML());

        $node->parentNode->removeChild($commentStart);
        $node->parentNode->removeChild($commentEnd);

        return $html;
    }

    /**
     * Normalize the document structure.
     *
     * This makes sure the document adheres to the general structure that AMP requires:
     *  ```
     *  <!DOCTYPE html>
     *  <html>
     *    <head>
     *      <meta charset="utf-8">
     *    </head>
     *    <body>
     *    </body>
     *  </html>
     *  ```
     *
     * Should one of the regular expressions fail with a PCRE error (for example by exceeding the backtrack
     * limit on a very large document), the affected step is skipped and the content is kept as it was, instead
     * of silently discarding the document.
     *
     * @param string $content Content to normalize the structure of.
     * @return string Normalized content.
     */
    private function normalizeDocumentStructure($content)
    {
        $matches   = [];
        $doctype   = self::DEFAULT_DOCTYPE;
        $htmlStart = '<html>';
        $htmlEnd   = '</html>';

        // preg_replace() returns null when PCRE hits an error; keep the untouched subject in that case.
        $replaceOrKeep = static function ($pattern, $replacement, $subject, $limit = -1) {
            $result = preg_replace($pattern, $replacement, $subject, $limit);

            return null === $result ? $subject : $result;
        };

        // Strip IE conditional comments, which are supported by IE 5-9 only (which AMP doesn't support).
        $content = $replaceOrKeep(self::HTML_IE_CONDITIONAL_COMMENTS_PATTERN, '', $content);

        // Detect and strip <!doctype> tags.
        if (preg_match(self::HTML_STRUCTURE_DOCTYPE_PATTERN, $content, $matches)) {
            $doctype = $matches['doctype'];
            $content = $replaceOrKeep(self::HTML_STRUCTURE_DOCTYPE_PATTERN, '', $content, 1);
        }

        // Detect and strip <html> tags.
        if (preg_match(self::HTML_STRUCTURE_HTML_START_TAG, $content, $matches)) {
            $htmlStart = $matches['html_start'];
            $content   = $replaceOrKeep(self::HTML_STRUCTURE_HTML_START_TAG, '', $content, 1);

            preg_match(self::HTML_STRUCTURE_HTML_END_TAG, $content, $matches);
            $htmlEnd = isset($matches['html_end']) ? $matches['html_end'] : $htmlEnd;
            $content = $replaceOrKeep(self::HTML_STRUCTURE_HTML_END_TAG, '', $content, 1);
        }

        // Detect <head> and <body> tags and add as needed.
        if (! preg_match(self::HTML_STRUCTURE_HEAD_START_TAG, $content, $matches)) {
            if (! preg_match(self::HTML_STRUCTURE_BODY_START_TAG, $content, $matches)) {
                // Both <head> and <body> missing.
                $content = "<head></head><body>{$content}</body>";
            } else {
                // Only <head> missing.
                $content = "<head></head>{$content}";
            }
        } elseif (! preg_match(self::HTML_STRUCTURE_BODY_END_TAG, $content, $matches)) {
            // Only <body> missing.
            // @todo This is an expensive regex operation, look into further optimization.
            $withBodyStart = preg_replace(self::HTML_STRUCTURE_HEAD_TAG, '$0<body>', $content, 1);

            // On a PCRE error no <body> start tag was inserted, so no end tag is appended either and the
            // content is left for the parser to deal with.
            if (null !== $withBodyStart) {
                $content = $withBodyStart . '</body>';
            }
        }

        $content = "{$htmlStart}{$content}{$htmlEnd}";

        // Reinsert a standard doctype (while preserving any potentially leading comments).
        $normalizedDoctype = preg_replace(self::HTML_DOCTYPE_REGEX_PATTERN, self::DEFAULT_DOCTYPE, $doctype);
        $doctype           = null === $normalizedDoctype ? self::DEFAULT_DOCTYPE : $normalizedDoctype;

        $content = "{$doctype}{$content}";

        return $content;
    }

    /**
     * Normalize the structure of the document if it was already provided as a DOM.
     *
     * Makes sure the document has an <html> document element containing a <head> and a <body>, and then
     * moves misplaced nodes into the <body>.
     */
    public function normalizeDomStructure()
    {
        if (! $this->documentElement) {
            $this->appendChild($this->createElement(Tag::HTML));
        }

        if (Tag::HTML !== $this->documentElement->nodeName) {
            $nextSibling = $this->documentElement->nextSibling;

            /**
             * The old document element that we need to remove and replace as we cannot just move it around.
*
             * @var Element
             */
            $oldDocumentElement = $this->removeChild($this->documentElement);

            $htmlElement = $this->createElement(Tag::HTML);
            $this->insertBefore($htmlElement, $nextSibling);

            // Reuse the old document element as <head> if it was one, otherwise look one up or create it.
            $head = $oldDocumentElement;
            if (Tag::HEAD !== $oldDocumentElement->nodeName) {
                $head = $this->getElementsByTagName(Tag::HEAD)->item(0) ?: $this->createElement(Tag::HEAD);
            }

            if (!$head instanceof Element) {
                throw FailedToRetrieveRequiredDomElement::forHeadElement($head);
            }

            $this->head = $head;
            $htmlElement->appendChild($this->head);

            // Same procedure for the <body>.
            $body = $oldDocumentElement;
            if (Tag::BODY !== $oldDocumentElement->nodeName) {
                $body = $this->getElementsByTagName(Tag::BODY)->item(0) ?: $this->createElement(Tag::BODY);
            }

            if (!$body instanceof Element) {
                throw FailedToRetrieveRequiredDomElement::forBodyElement($body);
            }

            $this->body = $body;
            $htmlElement->appendChild($this->body);

            // An old document element that was neither <head> nor <body> ends up as content of the <body>.
            if ($oldDocumentElement !== $this->body && $oldDocumentElement !== $this->head) {
                $this->body->appendChild($oldDocumentElement);
            }
        } else {
            // The document element already is <html>: only add a missing <head> and/or <body>.
            if (!$this->getElementsByTagName(Tag::HEAD)->item(0)) {
                $this->head = $this->createElement(Tag::HEAD);
                $this->documentElement->insertBefore($this->head, $this->documentElement->firstChild);
            }

            if (!$this->getElementsByTagName(Tag::BODY)->item(0)) {
                $this->body = $this->createElement(Tag::BODY);
                $this->documentElement->appendChild($this->body);
            }
        }

        $this->moveInvalidHeadNodesToBody();
        $this->movePostBodyNodesToBody();
    }

    /**
     * Normalizes HTML attributes to be HTML5 compatible.
     *
     * Removes html[xmlns] when it carries the XHTML namespace, and converts html[xml:lang] to html[lang]
     * unless a non-empty html[lang] is already present.
     */
    private function normalizeHtmlAttributes()
    {
        if (! $this->html->hasAttributes()) {
            return;
        }

        $xmlns = $this->html->attributes->getNamedItem('xmlns');
        if ($xmlns instanceof DOMAttr && 'http://www.w3.org/1999/xhtml' === $xmlns->nodeValue) {
            $this->html->removeAttributeNode($xmlns);
        }

        $xmlLang = $this->html->attributes->getNamedItem('xml:lang');
        if (! ($xmlLang instanceof DOMAttr)) {
            return;
        }

        $langNode = $this->html->attributes->getNamedItem('lang');
        $hasLang  = $langNode && $langNode->nodeValue;

        if (! $hasLang && $xmlLang->nodeValue) {
            // Move the html[xml:lang] value to html[lang].
            $this->html->setAttribute('lang', $xmlLang->nodeValue);
        }

        $this->html->removeAttributeNode($xmlLang);
    }

    /**
     * Move invalid head nodes back to the body.
     *
     * Each node of the <head> that isValidHeadNode() rejects is prepended to the <body>.
     */
    private function moveInvalidHeadNodesToBody()
    {
        // Walking backwards makes it easier to move elements in the expected order.
        for ($node = $this->head->lastChild; $node; $node = $previous) {
            // Remember the neighbour first, as moving the node detaches it from its siblings.
            $previous = $node->previousSibling;

            if (! $this->isValidHeadNode($node)) {
                $this->body->insertBefore($this->head->removeChild($node), $this->body->firstChild);
            }
        }
    }

    /**
     * Converts a possible head[profile] attribute to link[rel=profile].
     *
     * The head[profile] attribute is only valid in HTML4, not HTML5.
     * So if it exists and isn't empty, add it to the <head> as a link[rel=profile] and strip the attribute.
     */
    private function convertHeadProfileToLink()
    {
        if (! $this->head->hasAttribute(Attribute::PROFILE)) {
            return;
        }

        $profileUri = $this->head->getAttribute(Attribute::PROFILE);

        if ($profileUri) {
            $profileLink = $this->createElement(Tag::LINK);
            $profileLink->setAttribute(Attribute::REL, Attribute::PROFILE);
            $profileLink->setAttribute(Attribute::HREF, $profileUri);
            $this->head->appendChild($profileLink);
        }

        // The attribute is stripped even when it was empty.
        $this->head->removeAttribute(Attribute::PROFILE);
    }

    /**
     * Move any nodes appearing after </body> or </html> to be appended to the <body>.
     *
     * This accounts for markup that is output at shutdown, such markup from Query Monitor.
Not only are elements after
     * the </body> not valid in AMP, but trailing elements after </html> will get wrapped in additional <html> elements.
     * While comment nodes would be allowed in AMP, everything is moved regardless so that source stack comments will
     * retain their relative position with the element nodes they annotate.
     */
    private function movePostBodyNodesToBody()
    {
        // Move nodes (likely comments) from after the </body>.
        while ($this->body->nextSibling) {
            $this->body->appendChild($this->body->nextSibling);
        }

        // Move nodes from after the </html>.
        while ($this->documentElement->nextSibling) {
            $trailingNode = $this->documentElement->nextSibling;

            if (! ($trailingNode instanceof Element) || Tag::HTML !== $trailingNode->nodeName) {
                $this->body->appendChild($trailingNode);
                continue;
            }

            // Handle trailing elements getting wrapped in implicit duplicate <html>.
            while ($trailingNode->firstChild) {
                $this->body->appendChild($trailingNode->firstChild);
            }

            // Discard now-empty implicit <html>.
            $trailingNode->parentNode->removeChild($trailingNode);
        }
    }

    /**
     * Force all self-closing tags to have closing tags.
     *
     * This is needed because DOMDocument isn't fully aware of these.
     *
     * @param string $html HTML string to adapt.
     * @return string Adapted HTML string.
     * @see restoreSelfClosingTags() Reciprocal function.
     */
    private function replaceSelfClosingTags($html)
    {
        // The pattern is built once and then shared across calls.
        static $regexPattern = null;

        if (null === $regexPattern) {
            $tagAlternation = implode('|', Tag::SELF_CLOSING_TAGS);
            $regexPattern   = '#<(' . $tagAlternation . ')([^>]*?)(?>\s*(?<!\\\\)\/)?>(?!</\1>)#';
        }

        // Remember that the closing tags need to be stripped again when saving.
        $this->selfClosingTagsTransformed = true;

        return preg_replace($regexPattern, '<$1$2></$1>', $html);
    }

    /**
     * Restore all self-closing tags again.
     *
     * Does nothing unless replaceSelfClosingTags() was run since the last restore.
     *
     * @param string $html HTML string to adapt.
     * @return string Adapted HTML string.
     * @see replaceSelfClosingTags Reciprocal function.
     */
    private function restoreSelfClosingTags($html)
    {
        // The pattern is built once and then shared across calls.
        static $regexPattern = null;

        if (! $this->selfClosingTagsTransformed) {
            return $html;
        }

        if (null === $regexPattern) {
            $tagAlternation = implode('|', Tag::SELF_CLOSING_TAGS);
            $regexPattern   = '#</(' . $tagAlternation . ')>#i';
        }

        $this->selfClosingTagsTransformed = false;

        return preg_replace($regexPattern, '', $html);
    }

    /**
     * Maybe replace noscript elements with placeholders.
     *
     * This is done because libxml<2.8 might parse them incorrectly.
     * When appearing in the head element, a noscript can cause the head to close prematurely
     * and the noscript gets moved to the body and anything after it which was in the head.
     * See <https://stackoverflow.com/questions/39013102/why-does-noscript-move-into-body-tag-instead-of-head-tag>.
     * This is limited to only running in the head element because this is where the problem lies,
     * and it is important for the AMP_Script_Sanitizer to be able to access the noscript elements
     * in the body otherwise.
     *
     * @param string $html HTML string to adapt.
     * @return string Adapted HTML string.
     * @see maybeRestoreNoscriptElements() Reciprocal function.
     */
    private function maybeReplaceNoscriptElements($html)
    {
        if (! version_compare(LIBXML_DOTTED_VERSION, '2.8', '<')) {
            return $html;
        }

        // Swaps a single <noscript> element for a placeholder <meta> and stores the original markup by ID.
        $storeNoscript = function ($noscriptMatches) {
            $id = $this->getUniqueId('noscript');

            $this->noscriptPlaceholderComments[$id] = $noscriptMatches[0];

            return sprintf('<meta class="noscript-placeholder" id="%s">', $id);
        };

        // Processes everything in front of the <body> start tag.
        $replaceInHead = function ($headMatches) use ($storeNoscript) {
            return preg_replace_callback(
                '#<noscript[^>]*>.*?</noscript>#si',
                $storeNoscript,
                $headMatches[0]
            );
        };

        return preg_replace_callback('#^.+?(?=<body)#is', $replaceInHead, $html);
    }

    /**
     * Maybe restore noscript elements with placeholders.
     *
     * This is done because libxml<2.8 might parse them incorrectly.
     * When appearing in the head element, a noscript can cause the head to close prematurely
     * and the noscript gets moved to the body and anything after it which was in the head.
     * See <https://stackoverflow.com/questions/39013102/why-does-noscript-move-into-body-tag-instead-of-head-tag>.
* This is limited to only running in the head element because this is where the problem lies, * and it is important for the AMP_Script_Sanitizer to be able to access the noscript elements * in the body otherwise. * * @see maybeReplaceNoscriptElements() Reciprocal function. */ private function maybeRestoreNoscriptElements() { foreach ($this->noscriptPlaceholderComments as $id => $noscriptHtmlFragment) { $placeholderElement = $this->getElementById($id); if (!$placeholderElement || !$placeholderElement->parentNode) { continue; } $noscriptFragmentDocument = self::fromHtmlFragment($noscriptHtmlFragment); if (!$noscriptFragmentDocument) { continue; } $exportBody = $noscriptFragmentDocument->getElementsByTagName(Tag::BODY)->item(0); if (!$exportBody) { continue; } $importFragment = $this->createDocumentFragment(); while ($exportBody->firstChild) { $importNode = $exportBody->removeChild($exportBody->firstChild); $importNode = $this->importNode($importNode, true); $importFragment->appendChild($importNode); } $placeholderElement->parentNode->replaceChild($importFragment, $placeholderElement); } } /** * Secures instances of script[template="amp-mustache"] by renaming element to tmp-script, as a workaround to a * libxml parsing issue. * * This script can have closing tags of its children table and td stripped. * So this changes its name from script to tmp-script to avoid this. * * @link https://github.com/ampproject/amp-wp/issues/4254 * @see restoreMustacheScriptTemplates() Reciprocal function. * * @param string $html To replace the tag name that contains the mustache templates. * @return string The HTML, with the tag name of the mustache templates replaced. */ private function secureMustacheScriptTemplates($html) { return preg_replace( '#<script(\s[^>]*?template=(["\']?)amp-mustache\2[^>]*)>(.*?)</script\s*?>#is', '<tmp-script$1>$3</tmp-script>', $html ); } /** * Restores the tag names of script[template="amp-mustache"] elements that were replaced earlier. 
*
     * @see secureMustacheScriptTemplates() Reciprocal function.
     */
    private function restoreMustacheScriptTemplates()
    {
        // Copy the live node list into an array first, as the elements get replaced while looping.
        $tmp_script_elements = iterator_to_array($this->getElementsByTagName('tmp-script'));

        foreach ($tmp_script_elements as $tmp_script_element) {
            $script = $this->createElement(Tag::SCRIPT);

            foreach ($tmp_script_element->attributes as $attr) {
                $script->setAttribute($attr->nodeName, $attr->nodeValue);
            }

            // appendChild() moves the node, so firstChild advances on every iteration.
            while ($tmp_script_element->firstChild) {
                $script->appendChild($tmp_script_element->firstChild);
            }

            $tmp_script_element->parentNode->replaceChild($script, $tmp_script_element);
        }
    }

    /**
     * Replace AMP binding attributes with something that libxml can parse (as HTML5 data-* attributes).
     *
     * This is necessary because attributes in square brackets are not understood in PHP and
     * get dropped with an error raised:
     * > Warning: DOMDocument::loadHTML(): error parsing attribute name
     *
     * @see restoreAmpBindAttributes() Reciprocal function.
     * @link https://www.ampproject.org/docs/reference/components/amp-bind
     *
     * @param string $html HTML containing amp-bind attributes.
     * @return string HTML with AMP binding attributes replaced with HTML5 data-* attributes.
     */
    private function convertAmpBindAttributes($html)
    {
        /**
         * Replace callback.
         *
         * @param array $tagMatches Tag matches.
         * @return string Replacement.
         */
        $replaceCallback = function ($tagMatches) {
            // Strip the self-closing slash as long as it is not an attribute value, like for the href attribute.
            $oldAttrs = rtrim(preg_replace('#(?<!=)/$#', '', $tagMatches['attrs']));

            $newAttrs = '';
            $offset   = 0;
            while (preg_match(self::AMP_BIND_SQUARE_BRACKETS_ATTR_PATTERN, substr($oldAttrs, $offset), $attrMatches)) {
                $offset += strlen($attrMatches[0]);

                if ('[' === $attrMatches['name'][0]) {
                    $attrName  = trim($attrMatches['name'], '[]');
                    $newAttrs .= ' ' . self::AMP_BIND_DATA_ATTR_PREFIX . $attrName;
                    if (isset($attrMatches['value'])) {
                        $newAttrs .= $attrMatches['value'];
                    }
                    // Remember the attribute name so that it can be restored to bracket syntax later on.
                    $this->convertedAmpBindAttributes[] = $attrName;
                } else {
                    $newAttrs .= $attrMatches[0];
                }
            }

            // Bail on parse error which occurs when the regex isn't able to consume the entire $oldAttrs string.
            if (strlen($oldAttrs) !== $offset) {
                return $tagMatches[0];
            }

            return '<' . $tagMatches['name'] . $newAttrs . '>';
        };

        $converted = preg_replace_callback(
            self::AMP_BIND_SQUARE_START_PATTERN,
            $replaceCallback,
            $html
        );

        /*
         * If the regex engine incurred an error during processing, for example exceeding the backtrack
         * limit, $converted will be null. In this case we return the originally passed document to allow
         * DOMDocument to attempt to load it. If the AMP HTML doesn't make use of amp-bind or similar
         * attributes, then everything should still work.
         *
         * See https://github.com/ampproject/amp-wp/issues/993 for additional context on this issue.
         * See http://php.net/manual/en/pcre.constants.php for additional info on PCRE errors.
         */
        return (null !== $converted) ? $converted : $html;
    }

    /**
     * Convert AMP bind-attributes back to their original syntax.
     *
     * This is not guaranteed to produce the exact same result as the initial markup, as it is more of a best guess.
     * It can end up replacing the wrong attributes if the initial markup had inconsistent styling, mixing both syntaxes
     * for the same attribute. In either case, it will always produce working markup, so this is not that big of a deal.
     *
     * @see convertAmpBindAttributes() Reciprocal function.
     * @link https://www.ampproject.org/docs/reference/components/amp-bind
     *
     * @param string $html HTML with amp-bind attributes converted.
     * @return string HTML with amp-bind attributes restored.
     */
    public function restoreAmpBindAttributes($html)
    {
        if ($this->options[Option::AMP_BIND_SYNTAX] === Option::AMP_BIND_SYNTAX_DATA_ATTRIBUTE) {
            // All amp-bind attributes should remain in their converted data attribute form.
            return $html;
        }

        if (
            $this->options[Option::AMP_BIND_SYNTAX] === Option::AMP_BIND_SYNTAX_AUTO
            && empty($this->convertedAmpBindAttributes)
        ) {
            // Only previously converted amp-bind attributes should be restored, but none were converted.
            return $html;
        }

        /**
         * Replace callback.
         *
         * @param array $tagMatches Tag matches.
         * @return string Replacement.
         */
        $replaceCallback = function ($tagMatches) {
            // Strip the self-closing slash as long as it is not an attribute value, like for the href attribute.
            $oldAttrs = rtrim(preg_replace('#(?<!=)/$#', '', $tagMatches['attrs']));

            $newAttrs = '';
            $offset   = 0;
            while (preg_match(self::AMP_BIND_DATA_ATTRIBUTE_ATTR_PATTERN, substr($oldAttrs, $offset), $attrMatches)) {
                $offset += strlen($attrMatches[0]);

                $attrName = substr($attrMatches['name'], strlen(self::AMP_BIND_DATA_ATTR_PREFIX));
                if (
                    $this->options[Option::AMP_BIND_SYNTAX] === Option::AMP_BIND_SYNTAX_SQUARE_BRACKETS
                    || in_array($attrName, $this->convertedAmpBindAttributes, true)
                ) {
                    $attrValue = isset($attrMatches['value']) ? $attrMatches['value'] : '=""';
                    $newAttrs .= " [{$attrName}]{$attrValue}";
                } else {
                    $newAttrs .= $attrMatches[0];
                }
            }

            // Bail on parse error which occurs when the regex isn't able to consume the entire $oldAttrs string.
            if (strlen($oldAttrs) !== $offset) {
                return $tagMatches[0];
            }

            return '<' . $tagMatches['name'] . $newAttrs . '>';
        };

        $converted = preg_replace_callback(
            self::AMP_BIND_DATA_START_PATTERN,
            $replaceCallback,
            $html
        );

        /*
         * If the regex engine incurred an error during processing, for example exceeding the backtrack
         * limit, $converted will be null. In this case we return the originally passed document to allow
         * DOMDocument to attempt to load it. If the AMP HTML doesn't make use of amp-bind or similar
         * attributes, then everything should still work.
         *
         * See https://github.com/ampproject/amp-wp/issues/993 for additional context on this issue.
         * See http://php.net/manual/en/pcre.constants.php for additional info on PCRE errors.
         */
        return (null !== $converted) ? $converted : $html;
    }

    /**
     * Adapt the encoding of the content.
     *
     * @param string $source Source content to adapt the encoding of.
     * @return string Adapted content.
     */
    private function adaptEncoding($source)
    {
        // No encoding was provided, so we need to guess.
        if (Encoding::UNKNOWN === $this->originalEncoding && function_exists('mb_detect_encoding')) {
            $this->originalEncoding = mb_detect_encoding($source, Encoding::DETECTION_ORDER, true);
        }

        // Guessing the encoding seems to have failed, so we assume UTF-8 instead.
        // In my testing, this was not possible as long as one ISO-8859-x is in the detection order.
        if (empty($this->originalEncoding)) {
            $this->originalEncoding = Encoding::AMP; // @codeCoverageIgnore
        }

        $this->originalEncoding = $this->sanitizeEncoding($this->originalEncoding);

        // Sanitization failed, so we do a last effort to auto-detect.
        if (Encoding::UNKNOWN === $this->originalEncoding && function_exists('mb_detect_encoding')) {
            $detectedEncoding = mb_detect_encoding($source, Encoding::DETECTION_ORDER, true);
            if ($detectedEncoding !== false) {
                $this->originalEncoding = $detectedEncoding;
            }
        }

        // Only convert when the source is not already in the encoding AMP requires.
        $target = false;
        if (Encoding::AMP !== strtolower($this->originalEncoding)) {
            $target = function_exists('mb_convert_encoding')
                ? mb_convert_encoding($source, Encoding::AMP, $this->originalEncoding)
                : false;
        }

        return false !== $target ? $target : $source;
    }

    /**
     * Detect the encoding of the document.
     *
     * The encoding that is already known for the document is used as the starting point and is only
     * overridden by a meta tag that actually declares a charset.
     *
     * @param string $content Content of which to detect the encoding.
     * @return array {
     *     Detected encoding of the document, or false if none.
     *
     *     @type string       $content  Potentially modified content.
     *     @type string|false $encoding Encoding of the content, or false if not detected.
     * }
     */
    private function detectAndStripEncoding($content)
    {
        $encoding = $this->originalEncoding;

        // Check for HTML 4 http-equiv meta tags.
        foreach ($this->findTags($content, Tag::META, Attribute::HTTP_EQUIV) as $potentialHttpEquivTag) {
            // Use a temporary so that a http-equiv tag without a charset (e.g. a refresh directive)
            // does not wipe out the encoding that is already known.
            $potentialEncoding = $this->extractValue($potentialHttpEquivTag, Attribute::CHARSET);
            if (false !== $potentialEncoding) {
                $httpEquivTag = $potentialHttpEquivTag;
                $encoding     = $potentialEncoding;
            }
        }

        // Strip all charset tags.
        if (isset($httpEquivTag)) {
            $content = str_replace($httpEquivTag, '', $content);
        }

        // Check for HTML 5 charset meta tag. This overrides the HTML 4 charset.
        $charsetTag = $this->findTag($content, Tag::META, Attribute::CHARSET);
        if ($charsetTag) {
            $encoding = $this->extractValue($charsetTag, Attribute::CHARSET);

            // Strip the encoding if it is not the required one.
            if (strtolower($encoding) !== Encoding::AMP) {
                $content = str_replace($charsetTag, '', $content);
            }
        }

        return [$content, $encoding];
    }

    /**
     * Find a given tag with a given attribute.
     *
     * Only the first matching tag is looked up. The returned array is the match array that
     * preg_match() produced for it.
     *
     * @param string $content   Content in which to find the tag.
     * @param string $element   Element of the tag.
     * @param string $attribute Attribute that the tag contains.
     * @return string[] The preg_match() result for the first matching tag. Returns an empty array if none found.
     */
    private function findTags($content, $element, $attribute = null)
    {
        $matches = [];
        $pattern = empty($attribute)
            ? sprintf(
                self::HTML_FIND_TAG_WITHOUT_ATTRIBUTE_PATTERN,
                preg_quote($element, self::HTML_FIND_TAG_DELIMITER)
            )
            : sprintf(
                self::HTML_FIND_TAG_WITH_ATTRIBUTE_PATTERN,
                preg_quote($element, self::HTML_FIND_TAG_DELIMITER),
                preg_quote($attribute, self::HTML_FIND_TAG_DELIMITER)
            );

        if (preg_match($pattern, $content, $matches)) {
            return $matches;
        }

        return [];
    }

    /**
     * Find a given tag with a given attribute.
     *
     * If multiple tags match, this method will only return the first one.
     *
     * @param string $content   Content in which to find the tag.
     * @param string $element   Element of the tag.
     * @param string $attribute Attribute that the tag contains.
     * @return string|false The requested tag, or false if not found.
*/
    private function findTag($content, $element, $attribute = null)
    {
        $found = $this->findTags($content, $element, $attribute);

        // The full match sits at index 0 of the result array.
        return empty($found) ? false : $found[0];
    }

    /**
     * Pull the value of a single attribute out of an HTML tag string.
     *
     * @param string $tag       Tag from which to extract the attribute.
     * @param string $attribute Attribute of which to extract the value.
     * @return string|false Extracted attribute value, false if not found.
     */
    private function extractValue($tag, $attribute)
    {
        $pattern = sprintf(
            self::HTML_EXTRACT_ATTRIBUTE_VALUE_PATTERN,
            preg_quote($attribute, self::HTML_FIND_TAG_DELIMITER)
        );

        $groups = [];
        if (!preg_match($pattern, $tag, $groups)) {
            return false;
        }

        // Prefer the 'full' capture group and fall back to the 'partial' one when it is empty.
        if (empty($groups['full'])) {
            return $groups['partial'];
        }

        return $groups['full'];
    }

    /**
     * Normalize an encoding name and verify that mbstring knows about it.
     *
     * The name is lower-cased and translated through Encoding::MAPPINGS. If the result is not among the
     * encodings reported by mb_list_encodings(), Encoding::UNKNOWN is returned so that the conversion
     * logic can try to figure it out itself.
     *
     * @param string $encoding Encoding to sanitize.
     * @return string Sanitized encoding. Falls back to Encoding::UNKNOWN on failure.
     */
    private function sanitizeEncoding($encoding)
    {
        // Lower-cased list of encodings supported by mbstring, computed once per request.
        static $supportedEncodings = null;

        $normalized = strtolower($encoding);

        // Nothing to verify for the AMP encoding, and nothing to verify against without mbstring.
        if (Encoding::AMP === $normalized || !function_exists('mb_list_encodings')) {
            return $normalized;
        }

        if (null === $supportedEncodings) {
            $supportedEncodings = array_map('strtolower', mb_list_encodings());
        }

        if (array_key_exists($normalized, Encoding::MAPPINGS)) {
            $normalized = Encoding::MAPPINGS[$normalized];
        }

        return in_array($normalized, $supportedEncodings, true) ? $normalized : Encoding::UNKNOWN;
    }

    /**
     * Replace Mustache template tokens to safeguard them from turning into HTML entities.
     *
     * Prevents amp-mustache syntax from getting URL-encoded in attributes when saveHTML is done.
     * While this is applying to the entire document, it only really matters inside of <template>
     * elements, since URL-encoding of curly braces in href attributes would not normally matter.
* But when this is done inside of a <template> then it breaks Mustache. Since Mustache
     * is logic-less and curly braces are not unsafe for HTML, we can do a global replacement.
     * The replacement is done on the entire HTML document instead of just inside of the <template>
     * elements since it is faster and wouldn't change the outcome.
     *
     * @see restoreMustacheTemplateTokens() Reciprocal function.
     */
    private function replaceMustacheTemplateTokens()
    {
        $templateNodes = $this->xpath->query(self::XPATH_MUSTACHE_TEMPLATE_ELEMENTS_QUERY, $this->body);
        if (0 === $templateNodes->length) {
            return;
        }

        $placeholderMap = $this->getMustacheTagPlaceholders();
        $tagPattern     = $this->getMustacheTagPattern();

        // Swaps a matched mustache tag (ignoring surrounding whitespace) for its placeholder.
        $toPlaceholder = static function ($matches) use ($placeholderMap) {
            return $placeholderMap[trim($matches[0])];
        };

        foreach ($templateNodes as $templateNode) {
            $urlAttributes = $this->xpath->query(self::XPATH_URL_ENCODED_ATTRIBUTES_QUERY, $templateNode);

            foreach ($urlAttributes as $attribute) {
                $value = preg_replace_callback($tagPattern, $toPlaceholder, $attribute->nodeValue, -1, $count);

                if (!$count) {
                    continue;
                }

                // Note we cannot do `$attribute->nodeValue = $value` because the PHP DOM will try to parse any
                // entities. In the case of a URL value like '/foo/?bar=1&baz=2' the result is a warning for an
                // unterminated entity reference "baz". When the attribute value is updated via setAttribute() this
                // same problem does not occur, so that is why the following is used.
                $attribute->parentNode->setAttribute($attribute->nodeName, $value);
                $this->mustacheTagsReplaced = true;
            }
        }
    }

    /**
     * Put the original Mustache tokens back in place of their placeholders.
     *
     * @param string $html HTML string to adapt.
     * @return string Adapted HTML string.
     * @see replaceMustacheTemplateTokens() Reciprocal function.
     */
    private function restoreMustacheTemplateTokens($html)
    {
        // Nothing was replaced on the way in, so there is nothing to restore.
        if (! $this->mustacheTagsReplaced) {
            return $html;
        }

        $placeholderMap = $this->getMustacheTagPlaceholders();
        $tokens         = array_keys($placeholderMap);

        return str_replace($placeholderMap, $tokens, $html);
    }

    /**
     * Get amp-mustache tag/placeholder mappings.
     *
     * The mapping is generated on first use and then reused for subsequent calls.
     *
     * @return string[] Mapping of mustache tag token to its placeholder.
     * @see \wpdb::placeholder_escape()
     */
    private function getMustacheTagPlaceholders()
    {
        static $placeholders = null;

        if (null !== $placeholders) {
            return $placeholders;
        }

        $placeholders = [];

        // Note: The order of these tokens is important, as it determines the order of the replacements.
        foreach (['{{{', '}}}', '{{#', '{{^', '{{/', '{{', '}}'] as $token) {
            $placeholders[$token] = '_amp_mustache_' . md5(uniqid($token));
        }

        return $placeholders;
    }

    /**
     * Get a regular expression that matches all amp-mustache tags while consuming whitespace.
     *
     * Removing whitespace is needed to avoid DOMDocument turning whitespace into entities, like %20 for spaces.
     *
     * @return string Regex pattern to match amp-mustache tags with whitespace.
     */
    private function getMustacheTagPattern()
    {
        static $tagPattern = null;

        if (null !== $tagPattern) {
            return $tagPattern;
        }

        $delimiter    = ':';
        $alternatives = [];

        foreach (array_keys($this->getMustacheTagPlaceholders()) as $token) {
            $quoted = preg_quote($token, $delimiter);

            // Opening tokens swallow whitespace after them, closing tokens swallow whitespace before them.
            $alternatives[] = ('{' === $token[0]) ? $quoted . '\s*' : '\s*' . $quoted;
        }

        $tagPattern = $delimiter . implode('|', $alternatives) . $delimiter;

        return $tagPattern;
    }

    /**
     * Convert the emoji AMP symbol (⚡) into pure text.
     *
     * The emoji symbol gets stripped by DOMDocument::loadHTML().
     *
     * @param string $source Source HTML string to convert the emoji AMP symbol in.
     * @return string Adapted source HTML string.
     */
    private function convertAmpEmojiAttribute($source)
    {
        $this->usedAmpEmoji = '';

        $rewriteHtmlTag = function ($matches) {
            // Split into individual attributes.
            $pieces = preg_split(
                '#(\s+[^"\'\s=]+(?:=(?:"[^"]+"|\'[^\']+\'|[^"\'\s]+))?)#',
                $matches[1],
                -1,
                PREG_SPLIT_DELIM_CAPTURE
            );
            $attributes = array_map('trim', array_filter($pieces));

            $emojiPattern = '/^(' . Attribute::AMP_EMOJI_ALT . '|' . Attribute::AMP_EMOJI . ')(4(?:ads|email))?$/i';

            foreach ($attributes as $index => $attribute) {
                $attributeMatches = [];
                if (!preg_match($emojiPattern, $attribute, $attributeMatches)) {
                    continue;
                }

                // Remember which spelling was used so that it can be restored on output.
                $this->usedAmpEmoji = $attributeMatches[1];
                $variant            = ! empty($attributeMatches[2]) ? $attributeMatches[2] : '';
                $attributes[$index] = self::EMOJI_AMP_ATTRIBUTE_PLACEHOLDER . "=\"{$variant}\"";

                // Only the first matching attribute is converted.
                break;
            }

            return '<html ' . implode(' ', $attributes) . '>';
        };

        return preg_replace_callback(self::AMP_EMOJI_ATTRIBUTE_PATTERN, $rewriteHtmlTag, $source, 1);
    }

    /**
     * Restore the emoji AMP symbol (⚡) from its pure text placeholder.
     *
     * @param string $html HTML string to restore the AMP emoji symbol in.
     * @return string Adapted HTML string.
     */
    private function restoreAmpEmojiAttribute($html)
    {
        if (empty($this->usedAmpEmoji)) {
            return $html;
        }

        $pattern     = '/(<html\s[^>]*?)'
            . preg_quote(self::EMOJI_AMP_ATTRIBUTE_PLACEHOLDER, '/')
            . '="([^"]*)"/i';
        $replacement = '\1' . $this->usedAmpEmoji . '\2';

        return preg_replace($pattern, $replacement, $html, 1);
    }

    /**
     * Secure the original doctype node.
     *
     * We need to keep elements around that were prepended to the doctype, like comment node used for source-tracking.
     * As DOM_Document prepends a new doctype node and removes the old one if the first element is not the doctype, we
     * need to ensure the original one is not stripped (by changing its node type) and restore it later on.
     *
     * @param string $html HTML string to adapt.
     * @return string Adapted HTML string.
     * @see restoreDoctypeNode() Reciprocal function.
     */
    private function secureDoctypeNode($html)
    {
        // The number of replacements is stored in $this->securedDoctype for restoreDoctypeNode() to check.
        $secured = preg_replace(
            self::HTML_SECURE_DOCTYPE_IF_NOT_FIRST_PATTERN,
            '\1!--amp-\3\4-->',
            $html,
            1,
            $this->securedDoctype
        );

        return $secured;
    }

    /**
     * Restore the original doctype node.
     *
     * @param string $html HTML string to adapt.
     * @return string Adapted HTML string.
     * @see secureDoctypeNode() Reciprocal function.
*
     */
    private function restoreDoctypeNode($html)
    {
        // Nothing to do unless secureDoctypeNode() actually replaced a doctype.
        if (! $this->securedDoctype) {
            return $html;
        }

        return preg_replace(self::HTML_RESTORE_DOCTYPE_PATTERN, '\1!\3\4>', $html, 1);
    }

    /**
     * Process the HTML output string and tweak it as needed.
     *
     * @param string $html HTML output string to tweak.
     * @return string Tweaked HTML output string.
     */
    public function fixSvgSourceAttributeEncoding($html)
    {
        return preg_replace_callback(self::I_AMPHTML_SIZER_REGEX_PATTERN, [$this, 'adaptSizer'], $html);
    }

    /**
     * Adapt the sizer element so that it validates against the AMP spec.
     *
     * @param array $matches Matches that the regular expression collected.
     * @return string Adapted string to use as replacement.
     */
    private function adaptSizer($matches)
    {
        $src = $matches['src'];
        $src = htmlspecialchars_decode($src, ENT_NOQUOTES);
        $src = preg_replace_callback(self::SRC_SVG_REGEX_PATTERN, [$this, 'adaptSvg'], $src);

        return $matches['before_src'] . $src . $matches['after_src'];
    }

    /**
     * Adapt the SVG syntax within the sizer element so that it validates against the AMP spec.
     *
     * @param array $matches Matches that the regular expression collected.
     * @return string Adapted string to use as replacement.
     */
    private function adaptSvg($matches)
    {
        return $matches['type'] . urldecode($matches['value']);
    }

    /**
     * Deduplicate a given tag.
     *
     * This keeps the first tag as the main tag and moves over all child nodes and attribute nodes from any subsequent
     * same tags over to remove them.
     *
     * @param string $tagName Name of the tag to deduplicate.
     */
    public function deduplicateTag($tagName)
    {
        $tags = $this->getElementsByTagName($tagName);

        /**
         * Main tag to keep.
         *
         * @var Element|null $mainTag
         */
        $mainTag = $tags->item(0);

        if (null === $mainTag) {
            return;
        }

        // $tags is a live list, so it shrinks as duplicates get removed below.
        while ($tags->length > 1) {
            /**
             * Tag to remove.
             *
             * @var Element $tagToRemove
             */
            $tagToRemove = $tags->item(1);

            // Move all children over to the main tag. The current first child is taken on every pass instead
            // of iterating over childNodes, because moving nodes out of that live list while looping over it
            // can end the traversal early and leave children behind on the tag that is about to be removed.
            while ($tagToRemove->firstChild) {
                $mainTag->appendChild($tagToRemove->firstChild);
            }

            while ($tagToRemove->hasAttributes()) {
                /**
                 * Attribute node to move over to the main tag.
                 *
                 * @var DOMAttr $attribute
                 */
                $attribute = $tagToRemove->attributes->item(0);
                $tagToRemove->removeAttributeNode($attribute);

                // @TODO This doesn't deal properly with attributes present on both tags. Maybe overkill to add?
                // We could move over the copy_attributes from AMP_DOM_Utils to do this.
                $mainTag->setAttributeNode($attribute);
            }

            $tagToRemove->parentNode->removeChild($tagToRemove);
        }

        // Avoid doing the above query again if possible.
        if (in_array($tagName, [Tag::HEAD, Tag::BODY], true)) {
            $this->$tagName = $mainTag;
        }
    }

    /**
     * Determine whether a node can be in the head.
     *
     * @link https://github.com/ampproject/amphtml/blob/445d6e3be8a5063e2738c6f90fdcd57f2b6208be/validator/engine/htmlparser.js#L83-L100
     * @link https://www.w3.org/TR/html5/document-metadata.html
     *
     * @param DOMNode $node Node.
     * @return bool Whether valid head node.
     */
    public function isValidHeadNode(DOMNode $node)
    {
        return (
            ($node instanceof Element && in_array($node->nodeName, Tag::ELEMENTS_ALLOWED_IN_HEAD, true))
            ||
            ($node instanceof DOMText && preg_match('/^\s*$/', $node->nodeValue)) // Whitespace text nodes are OK.
            ||
            $node instanceof DOMComment
        );
    }

    /**
     * Get auto-incremented ID unique to this class's instantiation.
     *
     * A separate counter is kept per prefix, starting at 0.
     *
     * @param string $prefix Prefix.
     * @return string ID.
     */
    private function getUniqueId($prefix = '')
    {
        if (array_key_exists($prefix, $this->indexCounter)) {
            ++$this->indexCounter[$prefix];
        } else {
            $this->indexCounter[$prefix] = 0;
        }

        $uniqueId = (string)$this->indexCounter[$prefix];

        if ($prefix) {
            $uniqueId = "{$prefix}-{$uniqueId}";
        }

        return $uniqueId;
    }

    /**
     * Get the ID for an element.
     *
     * If the element does not have an ID, create one first.
     *
     * @param Element $element Element to get the ID for.
* @param string  $prefix  Optional. The prefix to use (should not have a trailing dash). Defaults to 'i-amp'.
     * @return string ID to use.
     */
    public function getElementId(Element $element, $prefix = 'i-amp')
    {
        if ($element->hasAttribute(Attribute::ID)) {
            return $element->getAttribute(Attribute::ID);
        }

        // Keep generating IDs until one is found that is not yet used by an element in the document.
        $id = $this->getUniqueId($prefix);
        while ($this->getElementById($id) instanceof Element) {
            $id = $this->getUniqueId($prefix);
        }

        $element->setAttribute(Attribute::ID, $id);

        return $id;
    }

    /**
     * Determine whether `data-ampdevmode` was initially set on the document element.
     *
     * @return bool
     */
    public function hasInitialAmpDevMode()
    {
        return $this->hasInitialAmpDevMode;
    }

    /**
     * Add style(s) to the <style amp-custom> tag.
     *
     * @param string $style Style to add.
     * @throws MaxCssByteCountExceeded If the allowed max byte count is exceeded.
     */
    public function addAmpCustomStyle($style)
    {
        $style         = trim($style, CssRule::CSS_TRIM_CHARACTERS);
        $existingStyle = (string)$this->ampCustomStyle->textContent;

        // Inject new styles before any potential source map annotation comment like: /*# sourceURL=amp-custom.css */.
        // If not present, then just put it at the end of the stylesheet. This isn't strictly required, but putting the
        // source map comments at the end is the convention.
        //
        // A callback is used for the replacement so that the CSS is inserted verbatim. When passed as a
        // preg_replace() replacement string, sequences such as "\201C", "$1" or "\\" in the CSS would be
        // interpreted as backreferences/escapes and silently corrupt the stylesheet.
        $newStyle = preg_replace_callback(
            ':(?=\s+/\*#[^*]+?\*/\s*$|$):s',
            static function () use ($style) {
                return $style;
            },
            $existingStyle,
            1
        );

        // On a PCRE error null is returned; fall back to appending at the end instead of wiping the stylesheet.
        if (null === $newStyle) {
            $newStyle = $existingStyle . $style;
        }

        $newByteCount = strlen($newStyle);

        if ($this->getRemainingCustomCssSpace() < ($newByteCount - $this->ampCustomStyleByteCount)) {
            throw MaxCssByteCountExceeded::forAmpCustom($newStyle);
        }

        $this->ampCustomStyle->textContent = $newStyle;
        $this->ampCustomStyleByteCount     = $newByteCount;
    }

    /**
     * Add the given number of bytes to the total inline style byte count.
     *
     * @param int $byteCount Bytes to add.
     */
    public function addInlineStyleByteCount($byteCount)
    {
        $this->inlineStyleByteCount += $byteCount;
    }

    /**
     * Get the remaining number of bytes allowed for custom CSS.
*
     * @return int
     */
    public function getRemainingCustomCssSpace()
    {
        if ($this->cssMaxByteCountEnforced < 0) {
            // No CSS byte count limit is being enforced, so return the next best thing to +∞.
            return PHP_INT_MAX;
        }

        // Never report a negative amount of remaining space.
        return max(
            0,
            $this->cssMaxByteCountEnforced - (int)$this->ampCustomStyleByteCount - (int)$this->inlineStyleByteCount
        );
    }

    /**
     * Magic getter to implement lazily-created, cached properties for the document.
     *
     * @param string $name Name of the property to get.
     * @return mixed Value of the property, or null if unknown property was requested.
     * @throws FailedToRetrieveRequiredDomElement If the html, head or body element cannot be retrieved.
     */
    public function __get($name)
    {
        switch ($name) {
            case 'xpath':
                $this->xpath = new DOMXPath($this);
                return $this->xpath;

            case Tag::HTML:
                $html = $this->getElementsByTagName(Tag::HTML)->item(0);

                if ($html === null) {
                    // Document was assembled manually and bypassed normalisation.
                    $this->normalizeDomStructure();
                    $html = $this->getElementsByTagName(Tag::HTML)->item(0);
                }

                if (!$html instanceof Element) {
                    throw FailedToRetrieveRequiredDomElement::forHtmlElement($html);
                }

                $this->html = $html;
                return $this->html;

            case Tag::HEAD:
                $head = $this->getElementsByTagName(Tag::HEAD)->item(0);

                if ($head === null) {
                    // Document was assembled manually and bypassed normalisation.
                    $this->normalizeDomStructure();
                    $head = $this->getElementsByTagName(Tag::HEAD)->item(0);
                }

                if (!$head instanceof Element) {
                    throw FailedToRetrieveRequiredDomElement::forHeadElement($head);
                }

                $this->head = $head;
                return $this->head;

            case Tag::BODY:
                $body = $this->getElementsByTagName(Tag::BODY)->item(0);

                if ($body === null) {
                    // Document was assembled manually and bypassed normalisation.
                    $this->normalizeDomStructure();
                    $body = $this->getElementsByTagName(Tag::BODY)->item(0);
                }

                if (!$body instanceof Element) {
                    throw FailedToRetrieveRequiredDomElement::forBodyElement($body);
                }

                $this->body = $body;
                return $this->body;

            case Attribute::VIEWPORT:
                // This is not cached as it could potentially be requested too early, before the viewport was added, and
                // the cache would then store null without rechecking later on after the viewport has been added.
                for ($node = $this->head->firstChild; $node !== null; $node = $node->nextSibling) {
                    if (
                        $node instanceof Element
                        && $node->tagName === Tag::META
                        && $node->getAttribute(Attribute::NAME) === Attribute::VIEWPORT
                    ) {
                        return $node;
                    }
                }
                return null;

            case 'ampElements':
                // This is not cached as we clone some elements during SSR transformations to avoid ending up with
                // partially transformed, broken elements.
                return $this->xpath->query(self::XPATH_AMP_ELEMENTS_QUERY, $this->body)
                    ?: new DOMNodeList();

            case 'ampCustomStyle':
                $ampCustomStyle = $this->xpath->query(self::XPATH_AMP_CUSTOM_STYLE_QUERY, $this->head)->item(0);

                // Create the <style amp-custom> element on demand if the document does not have one yet.
                if (!$ampCustomStyle instanceof Element) {
                    $ampCustomStyle = $this->createElement(Tag::STYLE);
                    $ampCustomStyle->appendChild($this->createAttribute(Attribute::AMP_CUSTOM));
                    $this->head->appendChild($ampCustomStyle);
                }

                $this->ampCustomStyle = $ampCustomStyle;
                return $this->ampCustomStyle;

            case 'ampCustomStyleByteCount':
                if (!isset($this->ampCustomStyle)) {
                    $ampCustomStyle = $this->xpath->query(self::XPATH_AMP_CUSTOM_STYLE_QUERY, $this->head)->item(0);
                    if (!$ampCustomStyle instanceof Element) {
                        // No <style amp-custom> element present; report 0 without creating one.
                        return 0;
                    } else {
                        $this->ampCustomStyle = $ampCustomStyle;
                    }
                }

                if (!isset($this->ampCustomStyleByteCount)) {
                    $this->ampCustomStyleByteCount = strlen($this->ampCustomStyle->textContent);
                }

                return $this->ampCustomStyleByteCount;

            case 'inlineStyleByteCount':
                if (!isset($this->inlineStyleByteCount)) {
                    $this->inlineStyleByteCount = 0;
                    $attributes = $this->xpath->query(self::XPATH_INLINE_STYLE_ATTRIBUTES_QUERY, $this->body);
                    foreach ($attributes as $attribute) {
                        $this->inlineStyleByteCount += strlen($attribute->textContent);
                    }
                }

                return $this->inlineStyleByteCount;
        }

        // Mimic regular PHP behavior for missing notices.
        trigger_error(self::PROPERTY_GETTER_ERROR_MESSAGE . $name, E_USER_NOTICE);
        return null;
    }

    /**
     * Make sure we properly reinitialize on clone.
     *
     * @return void
     */
    public function __clone()
    {
        $this->reset();
    }

    /**
     * Create new element node.
     *
     * @link https://php.net/manual/domdocument.createelement.php
     *
     * This override only serves to provide the correct object type-hint for our extended Dom/Element class.
     *
     * @param string      $name  The tag name of the element.
     * @param string|null $value Optional. The value of the element. By default, an empty element will be created.
     *                           You can also set the value later with Element->nodeValue.
     * @return Element|false A new instance of class Element or false if an error occurred.
     */
    public function createElement($name, $value = null)
    {
        // Cast explicitly: handing null to the string parameter of DOMDocument::createElement() is deprecated
        // as of PHP 8.1. The cast yields the same empty string the implicit coercion produced before.
        $element = parent::createElement($name, (string) $value);

        if (!$element instanceof Element) {
            return false;
        }

        return $element;
    }

    /**
     * Create new element node and set the given attributes on it.
     *
     * @link https://php.net/manual/domdocument.createelement.php
     *
     * This override only serves to provide the correct object type-hint for our extended Dom/Element class.
     *
     * @param string      $name       The tag name of the element.
     * @param array       $attributes Attributes to add to the newly created element.
     * @param string|null $value      Optional. The value of the element. By default, an empty element will be created.
     *                                You can also set the value later with Element->nodeValue.
     * @return Element|false A new instance of class Element or false if an error occurred.
     */
    public function createElementWithAttributes($name, $attributes, $value = null)
    {
        // See createElement() for why the value is cast to a string.
        $element = parent::createElement($name, (string) $value);

        if (!$element instanceof Element) {
            return false;
        }

        $element->setAttributes($attributes);

        return $element;
    }

    /**
     * Check whether the CSS maximum byte count is enforced.
*
     * @return bool Whether the CSS maximum byte count is enforced.
     */
    public function isCssMaxByteCountEnforced()
    {
        // A negative limit means that enforcement is switched off.
        return 0 <= $this->cssMaxByteCountEnforced;
    }

    /**
     * Set the upper bound for the number of CSS bytes.
     *
     * @param int $maxByteCount Maximum number of bytes to limit the CSS to. A negative number disables the limit.
     */
    public function enforceCssMaxByteCount($maxByteCount = Amp::MAX_CSS_BYTE_COUNT)
    {
        $this->cssMaxByteCountEnforced = $maxByteCount;
    }

    /**
     * Add a http-equiv charset meta tag to the document's <head> node.
     *
     * This is needed to make the DOMDocument behave as it should in terms of encoding.
     * See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
     *
     * The tag is injected at the first location that is available, tried in this order: the opening
     * <head> tag, the opening <html> tag, or a newly created <head> prepended to the markup.
     *
     * @param string $html HTML string to add the http-equiv charset to.
     * @return string Adapted string of HTML.
     */
    private function addHttpEquivCharset($html)
    {
        $replacements = 0;

        // We try first to detect an existing <head> node.
        $html = preg_replace(
            self::HTML_GET_HEAD_OPENING_TAG_PATTERN,
            self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT,
            $html,
            1,
            $replacements
        );
        if ($replacements >= 1) {
            return $html;
        }

        // If no <head> was found, we look for the <html> tag instead.
        $html = preg_replace(
            self::HTML_GET_HTML_OPENING_TAG_PATTERN,
            self::HTML_GET_HTML_OPENING_TAG_REPLACEMENT,
            $html,
            1,
            $replacements
        );
        if ($replacements >= 1) {
            return $html;
        }

        // Finally, we just prepend the head with the required http-equiv charset.
        return '<head>' . self::HTTP_EQUIV_META_TAG . '</head>' . $html;
    }
}
•
Search:
•
Replace:
Function
Edit by line
Download
Information
Rename
Copy
Move
Delete
Chmod
List