: str_replace(): Passing null to parameter #2 ($replace) of type array|string is deprecated in
namespace AmpProject\Dom;
use AmpProject\Attribute;
use AmpProject\Dom\Document\Encoding;
use AmpProject\Dom\Document\Option;
use AmpProject\Exception\FailedToRetrieveRequiredDomElement;
use AmpProject\Exception\MaxCssByteCountExceeded;
use AmpProject\Optimizer\CssRule;
* Abstract away some of the difficulties of working with PHP's DOMDocument.
* @property DOMXPath $xpath XPath query object for this document.
* @property Element $html The document's <html> element.
* @property Element $head The document's <head> element.
* @property Element $body The document's <body> element.
* @property Element|null $viewport The document's viewport meta element.
* @property DOMNodeList $ampElements The document's <amp-*> elements.
* @property Element $ampCustomStyle The document's <style amp-custom> element.
* @property int $ampCustomStyleByteCount Count of bytes of the CSS in the <style amp-custom> tag.
* @property int $inlineStyleByteCount Count of bytes of the CSS in all of the inline style attributes.
* @package ampproject/amp-toolbox
final class Document extends DOMDocument
* Default document type to use.
const DEFAULT_DOCTYPE = '<!DOCTYPE html>';
* Regular expression to match the HTML doctype.
const HTML_DOCTYPE_REGEX_PATTERN = '#<!doctype\s+html[^>]+?>#si';
* Attribute prefix for AMP-bind data attributes.
const AMP_BIND_DATA_ATTR_PREFIX = 'data-amp-bind-';
* Pattern for HTML attribute accounting for binding attr name in square brackets syntax, boolean attribute,
* single/double-quoted attribute value, and unquoted attribute values.
const AMP_BIND_SQUARE_BRACKETS_ATTR_PATTERN = '#^\s+(?P<name>\[?[a-zA-Z0-9_\-]+\]?)'
. '(?P<value>=(?>"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#';
* Pattern for HTML attribute accounting for binding attr name in data attribute syntax, boolean attribute,
* single/double-quoted attribute value, and unquoted attribute values.
const AMP_BIND_DATA_ATTRIBUTE_ATTR_PATTERN = '#^\s+(?P<name>(?:'
. self::AMP_BIND_DATA_ATTR_PREFIX
. '(?P<value>=(?>"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#';
* Match all start tags that contain a binding attribute in square brackets syntax.
const AMP_BIND_SQUARE_START_PATTERN = '#<'
. '(?P<name>[a-zA-Z0-9_\-]+)' // Tag name.
. '(?P<attrs>\s+' // Attributes.
. '(?>[^>"\'\[\]]+|"[^"]*+"|\'[^\']*+\')*+' // Non-binding attributes tokens.
. '\[[a-zA-Z0-9_\-]+\]' // One binding attribute key.
. '(?>[^>"\']+|"[^"]*+"|\'[^\']*+\')*+' // Any attribute tokens, including
* Match all start tags that contain a binding attribute in data attribute syntax.
const AMP_BIND_DATA_START_PATTERN = '#<'
. '(?P<name>[a-zA-Z0-9_\-]+)' // Tag name.
. '(?P<attrs>\s+' // Attributes.
. '(?>' // Match at least one attribute
. '(?>' // prefixed with "data-amp-bind-".
. self::AMP_BIND_DATA_ATTR_PREFIX
. '[a-zA-Z0-9_\-]+="[^"]*+"|\'[^\']*+\')'
. '[^>"\']+|"[^"]*+"|\'[^\']*+\''
. self::AMP_BIND_DATA_ATTR_PREFIX
. '(?>[^>"\']+|"[^"]*+"|\'[^\']*+\')*+' // Any attribute tokens, including
* Regular expressions to fetch the individual structural tags.
* These patterns were optimized to avoid extreme backtracking on large documents.
const HTML_STRUCTURE_DOCTYPE_PATTERN = '/^(?<doctype>[^<]*(?>\s*<!--.*?-->\s*)*<!doctype(?>\s+[^>]+)?>)/is';
const HTML_STRUCTURE_HTML_START_TAG = '/^(?<html_start>[^<]*(?>\s*<!--.*?-->\s*)*<html(?>\s+[^>]*)?>)/is';
const HTML_STRUCTURE_HTML_END_TAG = '/(?<html_end><\/html(?>\s+[^>]*)?>.*)$/is';
const HTML_STRUCTURE_HEAD_START_TAG = '/^[^<]*(?><!--.*?-->\s*)*(?><head(?>\s+[^>]*)?>)/is';
// Matches optional leading text and comments followed by the <body> start tag.
// Leading comments are matched lazily (.*?), consistent with the other structural patterns, so that
// a comment placed before <body> cannot swallow everything up to the last "-->" in the document
// (which, inside the atomic group, would make the <body> start tag impossible to match).
const HTML_STRUCTURE_BODY_START_TAG = '/^[^<]*(?><!--.*?-->\s*)*(?><body(?>\s+[^>]*)?>)/is';
const HTML_STRUCTURE_BODY_END_TAG = '/(?><\/body(?>\s+[^>]*)?>.*)$/is';
const HTML_STRUCTURE_HEAD_TAG = '/^(?>[^<]*(?><head(?>\s+[^>]*)?>).*?<\/head(?>\s+[^>]*)?>)/is';
// Regex patterns used for securing and restoring the doctype node.
const HTML_SECURE_DOCTYPE_IF_NOT_FIRST_PATTERN = '/(^[^<]*(?>\s*<!--[^>]*>\s*)+<)(!)(doctype)(\s+[^>]+?)(>)/i';
const HTML_RESTORE_DOCTYPE_PATTERN = '/(^[^<]*(?>\s*<!--[^>]*>\s*)+<)'
. '(!--amp-)(doctype)(\s+[^>]+?)(-->)/i';
// Regex pattern used for removing Internet Explorer conditional comments.
const HTML_IE_CONDITIONAL_COMMENTS_PATTERN = '/<!--(?>\[if\s|<!\[endif)(?>[^>]+(?<!--)>)*(?>[^>]+(?<=--)>)/i';
* Xpath query to fetch the attributes that are being URL-encoded by saveHTML().
const XPATH_URL_ENCODED_ATTRIBUTES_QUERY = './/*/@src|.//*/@href|.//*/@action';
* Xpath query to fetch the elements containing Mustache templates (both <template type=amp-mustache> and
* <script type=text/plain template=amp-mustache>).
const XPATH_MUSTACHE_TEMPLATE_ELEMENTS_QUERY = './/self::template[ @type = "amp-mustache" ]'
. '|//self::script[ @type = "text/plain" '
. 'and @template = "amp-mustache" ]';
* Error message to use when the __get() is triggered for an unknown property.
const PROPERTY_GETTER_ERROR_MESSAGE = 'Undefined property: AmpProject\\Dom\\Document::';
* Charset compatibility tag for making DOMDocument behave.
* See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
const HTTP_EQUIV_META_TAG = '<meta http-equiv="content-type" content="text/html; charset=utf-8">';
// Regex patterns and values used for adding and removing http-equiv charsets for compatibility.
// The opening tag pattern contains a comment to make sure we don't match a <head> tag within a comment.
const HTML_GET_HEAD_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<head(?>\s+[^>]*)?>/is';
const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG;
const HTML_GET_HTML_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<html(?>\s+[^>]*)?>/is';
const HTML_GET_HTML_OPENING_TAG_REPLACEMENT = '$0<head>' . self::HTTP_EQUIV_META_TAG . '</head>';
const HTML_GET_HTTP_EQUIV_TAG_PATTERN = '#<meta http-equiv=([\'"])content-type\1 '
. 'content=([\'"])text/html; '
const HTML_HTTP_EQUIV_VALUE = 'content-type';
const HTML_HTTP_EQUIV_CONTENT_VALUE = 'text/html; charset=utf-8';
// Regex patterns used for finding tags or extracting attribute values in an HTML string.
const HTML_FIND_TAG_WITHOUT_ATTRIBUTE_PATTERN = '/<%1$s[^>]*?>[^<]*(?><\/%1$s>)?/i';
const HTML_FIND_TAG_WITH_ATTRIBUTE_PATTERN = '/<%1$s [^>]*?\s*%2$s\s*=[^>]*?>[^<]*(?><\/%1$s>)?/i';
const HTML_EXTRACT_ATTRIBUTE_VALUE_PATTERN = '/%s=(?>([\'"])(?<full>.*)?\1|(?<partial>[^ \'";]+))/';
const HTML_FIND_TAG_DELIMITER = '/';
* Pattern to match an AMP emoji together with its variant (amp4ads, amp4email, ...).
const AMP_EMOJI_ATTRIBUTE_PATTERN = '/<html\s([^>]*?(?:'
. Attribute::AMP_EMOJI_ALT
. ')(4(?:ads|email))?[^>]*?)>/i';
// Attribute to use as a placeholder to move the emoji AMP symbol (⚡) over to DOM.
const EMOJI_AMP_ATTRIBUTE_PLACEHOLDER = 'emoji-amp';
// Patterns used for fixing the mangled encoding of src attributes with SVG data.
const I_AMPHTML_SIZER_REGEX_PATTERN = '/(?<before_src><i-amphtml-sizer\s+[^>]*>\s*<img\s+[^>]*?\s+src=([\'"]))'
. '(?<after_src>\2><\/i-amphtml-sizer>)/i';
const SRC_SVG_REGEX_PATTERN = '/^\s*(?<type>[^<]+)(?<value><svg[^>]+>)\s*$/i';
* XPath query to retrieve all <amp-*> tags, relative to the <body> node.
const XPATH_AMP_ELEMENTS_QUERY = ".//*[starts-with(name(), 'amp-')]";
* XPath query to retrieve the <style amp-custom> tag, relative to the <head> node.
const XPATH_AMP_CUSTOM_STYLE_QUERY = './/style[@amp-custom]';
* XPath query to fetch the inline style attributes, relative to the <body> node.
const XPATH_INLINE_STYLE_ATTRIBUTES_QUERY = './/@style';
* Associative array of options to configure the behavior of the DOM document abstraction.
* @see Option::DEFAULTS For a list of available options.
* Whether `data-ampdevmode` was initially set on the document element.
private $hasInitialAmpDevMode = false;
* The original encoding of how the AmpProject\Dom\Document was created.
* This is stored to do an automatic conversion to UTF-8, which is a requirement for most of the DOM extension calls.
private $originalEncoding;
* Store the <noscript> markup that was extracted to preserve it during parsing.
* The array keys are the element IDs for placeholder <meta> tags.
* @see maybeReplaceNoscriptElements()
* @see maybeRestoreNoscriptElements()
private $noscriptPlaceholderComments = [];
* Store whether mustache template tags were replaced and need to be restored.
* @see replaceMustacheTemplateTokens()
private $mustacheTagsReplaced = false;
* Whether we had secured a doctype that needs restoring or not.
* This is an int as it receives the $count from the preg_replace().
private $securedDoctype = 0;
* Whether the self-closing tags were transformed and need to be restored.
* This avoids duplicating this effort (maybe corrupting the DOM) on multiple calls to saveHTML().
private $selfClosingTagsTransformed = false;
* Store the emoji that was used to represent the AMP attribute.
* There are a few variations, so we want to keep track of this.
* @see https://github.com/ampproject/amphtml/issues/25990
* Store the current index by prefix.
* This is used to generate unique-per-prefix IDs.
private $indexCounter = [];
* The maximum number of bytes of CSS that is enforced.
* A negative number will disable the byte count limit.
private $cssMaxByteCountEnforced = -1;
* Store the names of the amp-bind attributes that were converted so that we can restore them later on.
private $convertedAmpBindAttributes = [];
* Creates a new AmpProject\Dom\Document object.
* @link https://php.net/manual/domdocument.construct.php
* @param string $version Optional. The version number of the document as part of the XML declaration.
* @param string $encoding Optional. The encoding of the document as part of the XML declaration.
public function __construct($version = '', $encoding = null)
// Remember the encoding the caller asked for; a null or otherwise falsy value falls back to Encoding::UNKNOWN.
$this->originalEncoding = (string)$encoding ?: Encoding::UNKNOWN;
// The underlying DOMDocument is always created with Encoding::AMP, regardless of the $encoding argument;
// the version defaults to '1.0' when none is given.
parent::__construct($version ?: '1.0', Encoding::AMP);
// Have element nodes of this document instantiated as Element instead of the base DOMElement class.
$this->registerNodeClass(DOMElement::class, Element::class);
// Start out with the default set of options.
$this->options = Option::DEFAULTS;
* Named constructor to provide convenient way of transforming HTML into DOM.
* @param string $html HTML to turn into a DOM.
* @param array|string $options Optional. Array of options to configure the document. Used as encoding if a string
* is passed. Defaults to an empty array.
* @return Document|false DOM generated from provided HTML, or false if the transformation failed.
public static function fromHtml($html, $options = [])
// Assume options are the encoding if a string is passed, for BC reasons.
if (is_string($options)) {
$options = [Option::ENCODING => $options];
// Read the encoding out of the options array (null when it is not set) and hand it to the constructor.
$encoding = isset($options[Option::ENCODING]) ? $options[Option::ENCODING] : null;
$dom = new self('', $encoding);
// Parse the markup through loadHTML(); this condition is true when loading did not succeed.
if (! $dom->loadHTML($html, $options)) {
* Named constructor to provide convenient way of transforming a HTML fragment into DOM.
* The difference to Document::fromHtml() is that fragments are not normalized as to their structure.
* @param string $html HTML to turn into a DOM.
* @param array|string $options Optional. Array of options to configure the document. Used as encoding if a string
* is passed. Defaults to an empty array.
* @return Document|false DOM generated from provided HTML, or false if the transformation failed.
public static function fromHtmlFragment($html, $options = [])
// Assume options are the encoding if a string is passed, for BC reasons.
if (is_string($options)) {
$options = [Option::ENCODING => $options];
// Read the encoding out of the options array (null when it is not set) and hand it to the constructor.
$encoding = isset($options[Option::ENCODING]) ? $options[Option::ENCODING] : null;
$dom = new self('', $encoding);
// Parse the markup through loadHTMLFragment() rather than loadHTML(); this condition is true when
// loading did not succeed.
if (! $dom->loadHTMLFragment($html, $options)) {
* Named constructor to provide convenient way of retrieving the DOM from a node.
* @param DOMNode $node Node to retrieve the DOM from. This is being modified by reference (!).
* @return Document DOM retrieved from the provided node.
public static function fromNode(DOMNode &$node)
* If the node->ownerDocument returns null, the node is the document.
$root = $node->ownerDocument === null ? $node : $node->ownerDocument;
// Check whether the owning document is already an instance of this class.
if ($root instanceof self) {
// We replace the $node by reference, to make sure the next lines of code will
// work as expected with the new document.
// Otherwise $dom and $node would refer to two different DOMDocuments.
// The second argument (true) requests a deep import, i.e. the node together with its subtree.
$node = $dom->importNode($node, true);
// Attach the imported copy as a child of the document held in $dom.
$dom->appendChild($node);
// Record whether the document element carries the dev mode attribute at this point.
$dom->hasInitialAmpDevMode = $dom->documentElement->hasAttribute(DevMode::DEV_MODE_ATTRIBUTE);
* Reset the internal optimizations of the Document object.
* This might be needed if you are doing an operation that causes the cached
* nodes and XPath objects to point to the wrong document.
* @return self Reset version of the Document object.
// Drop references to old DOM document.
unset($this->xpath, $this->head, $this->body);
// Reference of the document itself doesn't change here, but might need to change in the future.
* Load HTML from a string.
* @link https://php.net/manual/domdocument.loadhtml.php
* @param string $source The HTML string.
* @param array|int|string $options Optional. Array of options to configure the document. Used as additional Libxml
* parameters if an int or string is passed. Defaults to an empty array.
* @return bool true on success or false on failure.
public function loadHTML($source, $options = [])
// Run the markup through normalizeDocumentStructure() before it is parsed.
$source = $this->normalizeDocumentStructure($source);
// Delegate the actual parsing to loadHTMLFragment() and keep its result in $success.
$success = $this->loadHTMLFragment($source, $options);
// NOTE(review): presumably adds a charset declaration when the parsed document has none — confirm in the helper.
$this->insertMissingCharset();
// Do some further clean-up.
// Each of these helpers is defined elsewhere in the class; the call order is kept as is.
$this->deduplicateTag(Tag::HEAD);
$this->deduplicateTag(Tag::BODY);
$this->moveInvalidHeadNodesToBody();
$this->movePostBodyNodesToBody();
$this->convertHeadProfileToLink();
* Load a HTML fragment from a string.
* @param string $source The HTML fragment string.
* @param array|int|string $options Optional. Array of options to configure the document. Used as additional Libxml
* parameters if an int or string is passed. Defaults to an empty array.
* @return bool true on success or false on failure.
public function loadHTMLFragment($source, $options = [])
// Assume options are the additional libxml flags if a string or int is passed, for BC reasons.
if (is_string($options)) {
$options = (int) $options;
$options = [Option::LIBXML_FLAGS => $options];