: str_replace(): Passing null to parameter #2 ($replace) of type array|string is deprecated in
'rp' => array('rp' => 1, 'rt' => 1),
'rt' => array('rp' => 1, 'rt' => 1),
'td' => array('td' => 1, 'th' => 1),
'th' => array('td' => 1, 'th' => 1),
'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
$target_charset = DEFAULT_TARGET_CHARSET,
$defaultBRText = DEFAULT_BR_TEXT,
$defaultSpanText = DEFAULT_SPAN_TEXT,
if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
// Forcing tags to be closed implies that we don't trust the html, but
// it can lead to parsing errors if we SHOULD trust the html.
$this->optional_closing_array = array();
$this->_target_charset = $target_charset;
* @param string $defaultBRText
* @param string $defaultSpanText
$defaultBRText = DEFAULT_BR_TEXT,
$defaultSpanText = DEFAULT_SPAN_TEXT,
$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
// Script tag removal now precedes style tag removal.
// strip out <script> tags
$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
// strip out the \r \n's if we are told to.
$this->doc = str_replace("\r", ' ', $this->doc);
$this->doc = str_replace("\n", ' ', $this->doc);
// set the length of content since we have changed it.
$this->size = strlen($this->doc);
$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
$this->remove_noise("'<!--(.*?)-->'is");
// strip out <style> tags
$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
// strip out preformatted tags
$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
// strip out server side scripts
$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
$this->root->_[HDOM_INFO_END] = $this->cursor;
// make load function chainable
if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
function set_callback($function_name)
$this->callback = $function_name;
function remove_callback()
function save($filepath = '')
$ret = $this->root->innertext();
if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
* @param false $lowercase
* @return simple_html_dom_node | mixed
function find($selector, $idx = null, $lowercase = false)
return $this->root->find($selector, $idx, $lowercase);
if (isset($this->nodes)) {
foreach ($this->nodes as $n) {
// The next line was added as documented in the SourceForge repository, item
// 2977248, as a fix for ongoing memory leaks that occur even with the
if (isset($this->children)) {
foreach ($this->children as $n) {
if (isset($this->parent)) {
if (isset($this->root)) {
function dump($show_attr = true)
$this->root->dump($show_attr);
protected function prepare(
$defaultBRText = DEFAULT_BR_TEXT,
$defaultSpanText = DEFAULT_SPAN_TEXT)
$this->size = strlen($this->doc);
$this->original_size = $this->size; // original size of the html
$this->lowercase = $lowercase;
$this->default_br_text = $defaultBRText;
$this->default_span_text = $defaultSpanText;
$this->root = new simple_html_dom_node($this);
$this->root->tag = 'root';
$this->root->_[HDOM_INFO_BEGIN] = -1;
$this->root->nodetype = HDOM_TYPE_ROOT;
$this->parent = $this->root;
if ($this->size > 0) { $this->char = $this->doc[0]; }
protected function parse()
// Read next tag if there is no text between current position and the
if (($s = $this->copy_until_char('<')) === '') {
// Add a text node for text between tags
$node = new simple_html_dom_node($this);
$node->_[HDOM_INFO_TEXT] = $s;
$this->link_nodes($node, false);
protected function parse_charset()
if (function_exists('get_last_retrieve_url_contents_content_type')) {
$contentTypeHeader = get_last_retrieve_url_contents_content_type();
$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
if (is_object($debug_object)) {
$debug_object->debug_log(2,
'header content-type found charset of: '
// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
$fullvalue = $el->content;
if (is_object($debug_object)) {
$debug_object->debug_log(2,
'meta content-type tag found'
if (!empty($fullvalue)) {
// If there is a meta tag, and they don't specify the
// character set, research says that it's typically
if (is_object($debug_object)) {
$debug_object->debug_log(2,
'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
if ($meta = $this->root->find('meta[charset]', 0)) {
$charset = $meta->charset;
if (is_object($debug_object)) {
$debug_object->debug_log(2, 'meta charset: ' . $charset);
// Try to guess the charset based on the content
// Requires Multibyte String (mbstring) support (optional)
if (function_exists('mb_detect_encoding')) {
* mb_detect_encoding() is not intended to distinguish between
* charsets, especially single-byte charsets. Its primary
* purpose is to detect which multibyte encoding is in use,
* i.e. UTF-8, UTF-16, shift-JIS, etc.
* -- https://bugs.php.net/bug.php?id=38138
* Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
* always result in CP1251/ISO-8859-5 and vice versa.
* Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
$encoding = mb_detect_encoding(
array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
// Due to a limitation of mb_detect_encoding
// 'CP1251'/'ISO-8859-5' will be detected as
// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
// which case we can simply assume it is the other charset.
if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
if ($encoding !== false) {
if (is_object($debug_object)) {
$debug_object->debug_log(2, 'mb_detect: ' . $charset);
// Assume it's UTF-8 as it is the most likely charset to be used
if (is_object($debug_object)) {
$debug_object->debug_log(2, 'No match found, assume ' . $charset);
// Since CP1252 is a superset, if we get one of its subsets, we want
if ((strtolower($charset) == 'iso-8859-1')
|| (strtolower($charset) == 'latin1')
|| (strtolower($charset) == 'latin-1')) {
if (is_object($debug_object)) {
$debug_object->debug_log(2,
'replacing ' . $charset . ' with CP1252 as its a superset'
if (is_object($debug_object)) {
$debug_object->debug_log(1, 'EXIT - ' . $charset);
return $this->_charset = $charset;
protected function read_tag()
// Set end position if no further tags found
if ($this->char !== '<') {
$this->root->_[HDOM_INFO_END] = $this->cursor;
$begin_tag_pos = $this->pos;
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
if ($this->char === '/') {
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
// Skip whitespace in end tags (i.e. in "</ html>")
$this->skip($this->token_blank);
$tag = $this->copy_until_char('>');
// Skip attributes in end tags
if (($pos = strpos($tag, ' ')) !== false) {
$tag = substr($tag, 0, $pos);
$parent_lower = strtolower($this->parent->tag);
$tag_lower = strtolower($tag);
// The end tag is supposed to close the parent tag. Handle situations
if ($parent_lower !== $tag_lower) {
// Parent tag does not have to be closed necessarily (optional closing tag)
// Current tag is a block tag, so it may close an ancestor
if (isset($this->optional_closing_tags[$parent_lower])
&& isset($this->block_tags[$tag_lower])) {
$this->parent->_[HDOM_INFO_END] = 0;
$org_parent = $this->parent;
// Traverse ancestors to find a matching opening tag
while (($this->parent->parent)
&& strtolower($this->parent->tag) !== $tag_lower
$this->parent = $this->parent->parent;
// If we don't have a match add current tag as text node
if (strtolower($this->parent->tag) !== $tag_lower) {
$this->parent = $org_parent; // restore original parent
if ($this->parent->parent) {
$this->parent = $this->parent->parent;
$this->parent->_[HDOM_INFO_END] = $this->cursor;
return $this->as_text_node($tag);
} elseif (($this->parent->parent)
&& isset($this->block_tags[$tag_lower])
// Grandparent exists and current tag is a block tag, so our
// parent doesn't have an end tag
$this->parent->_[HDOM_INFO_END] = 0; // No end tag
$org_parent = $this->parent;
// Traverse ancestors to find a matching opening tag
while (($this->parent->parent)
&& strtolower($this->parent->tag) !== $tag_lower
$this->parent = $this->parent->parent;
// If we don't have a match add current tag as text node
if (strtolower($this->parent->tag) !== $tag_lower) {
$this->parent = $org_parent; // restore original parent
$this->parent->_[HDOM_INFO_END] = $this->cursor;
return $this->as_text_node($tag);
} elseif (($this->parent->parent)
&& strtolower($this->parent->parent->tag) === $tag_lower
) { // Grandparent exists and current tag closes it
$this->parent->_[HDOM_INFO_END] = 0;
$this->parent = $this->parent->parent;
} else { // Random tag, add as text node
return $this->as_text_node($tag);
// Set end position of parent tag to current cursor position
$this->parent->_[HDOM_INFO_END] = $this->cursor;
if ($this->parent->parent) {
$this->parent = $this->parent->parent;
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
$node = new simple_html_dom_node($this);
$node->_[HDOM_INFO_BEGIN] = $this->cursor;
$tag = $this->copy_until($this->token_slash); // Get tag name
$node->tag_start = $begin_tag_pos;
// doctype, cdata & comments...
if (isset($tag[0]) && $tag[0] === '!') {
$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
$node->nodetype = HDOM_TYPE_COMMENT;
} else { // Could be doctype or CDATA but we don't care
$node->nodetype = HDOM_TYPE_UNKNOWN;
if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
$this->link_nodes($node, true);
$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next