// Stray PHP 8.1 deprecation notice pasted into the source (not code): str_replace(): Passing null to parameter #2 ($replace) of type array|string is deprecated in
require_once(dirname(__FILE__) . '/wfAPI.php');
require_once(dirname(__FILE__) . '/wfArray.php');
/**
 * Scans arbitrary text for URL-like strings, buffers the discovered hosts
 * (keyed by 4-byte SHA-256 hash prefixes) into a database table, then checks
 * those prefixes — and, for flagged prefixes, the full URLs — against the
 * Wordfence scanning servers (see getBaddies()).
 */
class wordfenceURLHoover {
public $errorMsg = false; // Last error message, or false if no error has occurred.
private $hostsToAdd = false; // wfArray buffer of pending (owner, host, path, hostKey) records; created in __construct()/__wakeup().
private $wordpressVersion = false; // WordPress version string passed to the API client.
private $hostKeys = array(); // In-memory host-key collection; used by getBaddies() when not reading keys from the DB.
private $hostList = array(); // In-memory (owner, hostKey, ...) records; scanned in getBaddies() and appended to in writeHosts().
public $currentHooverID = false; // Owner id for the hoover() call in progress; read by the captureURL() regex callback.
private $_foundSome = false; // Whether any URL was captured; returned by hoover(). Presumably set inside captureURL() on lines omitted from this excerpt — TODO confirm.
private $_excludedHosts = array(); // Hostnames to skip during capture (e.g. this site's own hosts).
/**
 * Returns the hostnames that should never be flagged by the hoover: the
 * home/site URL hosts of this install (and, apparently, of every blog in a
 * multisite network via the $wpdb->blogs query). Memoized in a static for the
 * life of the request.
 *
 * NOTE(review): braces are unbalanced in this excerpt — the closing brace of
 * the memoization guard and the conditional wrapping the $wpdb query (which
 * presumably includes a `global $wpdb;` and a multisite check) appear to be
 * on lines omitted from this view. TODO confirm against the full file.
 *
 * @return string[] De-duplicated list of excluded host names.
 */
public static function standardExcludedHosts() {
static $standardExcludedHosts = null;
// Return the cached result if this has already been computed.
if ($standardExcludedHosts !== null) {
return $standardExcludedHosts;
$excludedHosts = array();
$blogIDs = $wpdb->get_col("SELECT blog_id FROM {$wpdb->blogs}"); //Can't use wp_get_sites or get_sites because they return empty at 10k sites
foreach ($blogIDs as $id) {
// Record the host of each blog's home URL and site URL.
$homeURL = get_home_url($id);
$host = parse_url($homeURL, PHP_URL_HOST);
$excludedHosts[$host] = 1;
$siteURL = get_site_url($id);
$host = parse_url($siteURL, PHP_URL_HOST);
$excludedHosts[$host] = 1;
// Also record the hosts of the main install's home/site URLs.
$homeURL = wfUtils::wpHomeURL();
$host = parse_url($homeURL, PHP_URL_HOST);
$excludedHosts[$host] = 1;
$siteURL = wfUtils::wpSiteURL();
$host = parse_url($siteURL, PHP_URL_HOST);
$excludedHosts[$host] = 1;
// Keys were used as a set for de-duplication; return just the host names.
$standardExcludedHosts = array_keys($excludedHosts);
return $standardExcludedHosts;
/**
 * Limits serialization to scalar state; resource-backed members (the API
 * client, the wfArray buffer) are rebuilt in __wakeup().
 *
 * @return string[] Property names to serialize.
 */
public function __sleep() {
return array('debug', 'errorMsg', 'table', 'apiKey', 'wordpressVersion');
/**
 * Rebuilds the members excluded from serialization by __sleep(): a fresh
 * hostsToAdd buffer and a new API client from the serialized apiKey and
 * wordpressVersion.
 */
public function __wakeup() {
$this->hostsToAdd = new wfArray(array('owner', 'host', 'path', 'hostKey'));
$this->api = new wfAPI($this->apiKey, $this->wordpressVersion);
/**
 * @param string $apiKey           Wordfence API key for the wfAPI client.
 * @param string $wordpressVersion WordPress version string sent to the API.
 * @param mixed  $db               Optional DB handle (unused in the lines visible here — TODO confirm).
 * @param bool   $continuation     Whether this resumes a prior scan (unused in the lines visible here — TODO confirm).
 */
public function __construct($apiKey, $wordpressVersion, $db = false, $continuation = false) {
$this->hostsToAdd = new wfArray(array('owner', 'host', 'path', 'hostKey'));
$this->wordpressVersion = $wordpressVersion;
$this->api = new wfAPI($apiKey, $wordpressVersion);
// NOTE(review): the two table assignments below are presumably alternative
// branches of a conditional (network-prefixed table vs. hard-coded fallback)
// whose if/else lines are omitted from this excerpt — TODO confirm.
$this->table = wfDB::networkTable('wfHoover');
$this->table = 'wp_wfHoover';
/**
 * Empties the hoover table, discarding all buffered host/hash records.
 */
public function cleanup() {
$this->db->truncate($this->table);
/**
 * Scans $data for URL-like strings. Each match is handed to captureURL(),
 * which records it under the owner id $id unless its host is excluded.
 *
 * @param mixed    $id            Owner identifier attached to every captured host (e.g. a file path or post id).
 * @param string   $data          Raw text to scan.
 * @param string[] $excludedHosts Hostnames to skip (compared case-insensitively in captureURL()).
 * @return bool Whether at least one URL was captured ($this->_foundSome).
 */
public function hoover($id, $data, $excludedHosts = array()) {
$this->currentHooverID = $id;
$this->_excludedHosts = $excludedHosts;
// Regex matches protocol-relative URL shapes (//host[:port][/path...]); the
// callback does the real filtering. Errors are suppressed with @ — a PCRE
// failure (e.g. backtrack limit) silently yields no captures.
@preg_replace_callback('_((?:(?://)(?:\S+(?::\S*)?@)?(?:(?:(?:[a-z\xa1-\xff0-9.-]+)(?:\.(?:(?:xn--[a-z\xa1-\xff0-9-]+)|[a-z\xa1-\xff]{2,}))))(?::\d{2,5})?)(?:/[a-z0-9\-\_\.~\!\*\(\);\:@&\=\+\$,\?#\[\]%]*)*)_iS', array($this, 'captureURL'), $data);
return $this->_foundSome;
/**
 * Logs $msg via wordfence::status() at level 4 ('info') when $this->debug is truthy.
 */
private function dbg($msg) {
if ($this->debug) { wordfence::status(4, 'info', $msg); }
/**
 * preg_replace_callback handler for hoover(): validates one regex match as a
 * URL, filters it by TLD list and excluded hosts, then buffers its GSB hash
 * prefixes into hostsToAdd (flushing to the DB every 1000 entries).
 *
 * NOTE(review): several conditionals below have no visible bodies/else
 * branches (e.g. the failed-validation and excluded-host cases presumably
 * return early) — those lines appear to be omitted from this excerpt.
 *
 * @param array $matches PCRE match array; $matches[0] is the protocol-relative URL text.
 */
public function captureURL($matches) {
$id = $this->currentHooverID;
// The pattern matched "//host/..."; prepend a scheme so parse_url/filter_var work.
$url = 'http:' . $matches[0];
if (!filter_var($url, FILTER_VALIDATE_URL)) {
$components = parse_url($url);
// Extract the trailing TLD-like label (punycode or alphabetic).
if (preg_match('/\.(xn--(?:[a-z0-9-]*)[a-z0-9]+|[a-z\xa1-\xff0-9]{2,})$/i', $components['host'], $tld)) {
$tld = strtolower($tld[1]);
// Only proceed for TLDs present in the pipe-delimited configured TLD list
// (the match being === false here presumably skips the URL — TODO confirm).
if (strpos(wfConfig::get('tldlist', ''), '|' . $tld . '|') === false) {
// Skip hosts the caller excluded (case-insensitive comparison).
foreach ($this->_excludedHosts as $h) {
if (strcasecmp($h, $components['host']) === 0) {
$host = (isset($components['host']) ? $components['host'] : '');
$path = (isset($components['path']) && !empty($components['path']) ? $components['path'] : '/');
// One record per GSB hash variant; only the first 4 bytes (the prefix) are stored.
$hashes = $this->_generateHashes($url);
foreach ($hashes as $h) {
$this->hostsToAdd->push(array('owner' => $id, 'host' => $host, 'path' => $path, 'hostKey' => wfUtils::substr($h, 0, 4)));
// Flush the buffer in batches to bound memory use.
if($this->hostsToAdd->size() > 1000){ $this->writeHosts(); }
/**
 * Drains the hostsToAdd buffer, either batch-INSERTing the records into the
 * hoover table or appending them to the in-memory hostList.
 *
 * NOTE(review): the two drain loops below cannot both run against the same
 * buffer — they are presumably alternative branches (DB-backed vs. in-memory)
 * of an if/else whose lines are omitted from this excerpt. TODO confirm.
 */
private function writeHosts() {
if ($this->hostsToAdd->size() < 1) { return; }
// Build one multi-row INSERT; values are escaped via realEscape rather than
// bound as parameters.
$sql = "INSERT INTO " . $this->table . " (owner, host, path, hostKey) VALUES ";
while ($elem = $this->hostsToAdd->shift()) {
//This may be an issue for hyperDB or other abstraction layers, but leaving it for now.
$sql .= sprintf("('%s', '%s', '%s', '%s'),",
$this->db->realEscape($elem['owner']),
$this->db->realEscape($elem['host']),
$this->db->realEscape($elem['path']),
$this->db->realEscape($elem['hostKey'])
$this->db->queryWrite($sql);
$this->hostsToAdd->collectGarbage();
// Alternative path: keep records in memory for the getBaddies() hostList scan.
while ($elem = $this->hostsToAdd->shift()) {
$keys = str_split($elem['hostKey'], 4);
$this->hostList[] = array(
'owner' => $elem['owner'],
'hostKey' => $elem['hostKey']
$this->hostsToAdd->collectGarbage();
/**
 * Main reconciliation pass. Phases, as visible in this excerpt:
 *   1. Gather distinct 4-byte hostKey prefixes — via direct MySQLi, via the
 *      DB abstraction layer, or from the in-memory hostKeys array (the
 *      branching between these paths is partly on lines omitted here).
 *   2. Send the concatenated prefixes to the Wordfence API
 *      ('check_host_keys'); the response is a binary list of 2-byte
 *      big-endian indices into the prefix list, identifying flagged prefixes.
 *   3. Map each flagged prefix back to its owner/host/path records (DB query
 *      or hostList scan) and collect full URLs per owner.
 *   4. Send the collected URLs to 'check_bad_urls' and build $finalResults,
 *      grouping confirmed-bad sites per source file/owner.
 *
 * NOTE(review): many branch/return lines are missing from this excerpt; the
 * return statement itself is not visible. Presumably returns $finalResults —
 * TODO confirm against the full file.
 */
public function getBaddies() {
wordfence::status(4, 'info', __("Gathering host keys.", 'wordfence'));
$useMySQLi = wfUtils::useMySQLi();
if ($useMySQLi) { //If direct-access MySQLi is available, we use it to minimize the memory footprint instead of letting it fetch everything into an array first
wordfence::status(4, 'info', __("Using MySQLi directly.", 'wordfence'));
$result = $dbh->query("SELECT DISTINCT hostKey FROM {$this->table} ORDER BY hostKey ASC LIMIT 100000"); /* We limit to 100,000 prefixes since more than that cannot be reliably checked within the default max_execution_time */
if (!is_object($result)) {
$this->errorMsg = "Unable to query database";
$this->dbg($this->errorMsg);
// Concatenate all 4-byte prefixes into one binary string for the API call.
while ($row = $result->fetch_assoc()) {
$allHostKeys .= $row['hostKey'];
$q1 = $this->db->querySelect("SELECT DISTINCT hostKey FROM {$this->table} ORDER BY hostKey ASC LIMIT 100000"); /* We limit to 100,000 prefixes since more than that cannot be reliably checked within the default max_execution_time */
$allHostKeys .= $hRec['hostKey'];
// In-memory fallback: build the prefix string from the hostKeys array.
$allHostKeys = implode('', array_values(array_unique($this->hostKeys)));
* Check hash prefixes first. Each one is a 4-byte binary prefix of a SHA-256 hash of the URL. The response will
* be a binary list of 4-byte indices; The full URL for each index should be sent in the secondary query to
* find the true good/bad status.
$allCount = wfUtils::strlen($allHostKeys) / 4;
$this->dbg("Checking {$allCount} hostkeys");
for ($i = 0; $i < $allCount; $i++) {
$key = wfUtils::substr($allHostKeys, $i * 4, 4);
$this->dbg("Checking hostkey: " . bin2hex($key));
wordfence::status(2, 'info', sprintf(/* translators: Number of domains. */ __("Checking %d host keys against Wordfence scanning servers.", 'wordfence'), $allCount));
$resp = $this->api->binCall('check_host_keys', $allHostKeys);
wordfence::status(2, 'info', __("Done host key check.", 'wordfence'));
$this->dbg("Done host key check");
if ($resp['code'] >= 200 && $resp['code'] <= 299) {
$this->dbg("Host key response: " . bin2hex($resp['data']));
$dataLen = wfUtils::strlen($resp['data']);
// Response is a sequence of 2-byte indices, so its length must be even.
if ($dataLen > 0 && $dataLen % 2 == 0) {
$this->dbg("Checking response indexes");
for ($i = 0; $i < $dataLen; $i += 2) {
// 'n' = unsigned 16-bit big-endian index into the prefix list.
$idx = wfUtils::array_first(unpack('n', wfUtils::substr($resp['data'], $i, 2)));
$this->dbg("Checking index {$idx}");
$prefix = wfUtils::substr($allHostKeys, $idx * 4, 4);
$this->dbg("Got bad hostkey for record: " . bin2hex($prefix));
// Index out of range of the prefixes we sent — record the error.
$this->dbg("Bad allHostKeys index: {$idx}");
$this->errorMsg = "Bad allHostKeys index: {$idx}";
$this->errorMsg = "Invalid data length received from Wordfence server: " . $dataLen;
$this->dbg($this->errorMsg);
$this->errorMsg = "Wordfence server responded with an error. HTTP code " . $resp['code'] . " and data: " . $resp['data'];
$badCount = wfUtils::strlen($badHostKeys) / 4;
//Reconcile flagged prefixes with their corresponding URLs
for ($i = 0; $i < $badCount; $i++) {
$prefix = wfUtils::substr($badHostKeys, $i * 4, 4);
* Putting a 10000 limit in here for sites that have a huge number of items with the same URL
* that repeats. This is an edge case. But if the URLs are malicious then presumably the admin
* will fix the malicious URLs and on subsequent scans the items (owners) that are above the
* 10000 limit will appear.
$q1 = $this->db->querySelect("SELECT DISTINCT owner, host, path FROM {$this->table} WHERE hostKey = %s LIMIT 10000", $prefix);
$url = 'http://' . $rec['host'] . $rec['path'];
// Group candidate URLs by owner, de-duplicating per owner.
if (!isset($urlsToCheck[$rec['owner']])) {
$urlsToCheck[$rec['owner']] = array();
if (!in_array($url, $urlsToCheck[$rec['owner']])) {
$urlsToCheck[$rec['owner']][] = $url;
// In-memory path: scan hostList for records whose hostKey contains the
// flagged prefix at a 4-byte boundary.
foreach ($this->hostList as $rec) {
$pos = wfUtils::strpos($rec['hostKey'], $prefix);
if ($pos !== false && $pos % 4 == 0) {
$url = 'http://' . $rec['host'] . $rec['path'];
if (!isset($urlsToCheck[$rec['owner']])) {
$urlsToCheck[$rec['owner']] = array();
if (!in_array($url, $urlsToCheck[$rec['owner']])) {
$urlsToCheck[$rec['owner']][] = $url;
// Hard cap on the total URL count sent to the secondary check.
if ($totalURLs > 10000) { break; }
if (count($urlsToCheck) > 0) {
wordfence::status(2, 'info', sprintf(
/* translators: 1. Number of URLs. 2. Number of files. */
__('Checking %1$d URLs from %2$d sources.', 'wordfence'),
// Secondary check: send the full URLs and receive the confirmed-bad subset.
$badURLs = $this->api->call('check_bad_urls', array(), array('toCheck' => json_encode($urlsToCheck)));
wordfence::status(2, 'info', __("Done URL check.", 'wordfence'));
$this->dbg("Done URL check");
if (is_array($badURLs) && count($badURLs) > 0) {
foreach ($badURLs as $file => $badSiteList) {
if (!isset($finalResults[$file])) {
$finalResults[$file] = array();
foreach ($badSiteList as $badSite) {
$finalResults[$file][] = array(
$this->dbg("Confirmed " . count($badURLs) . " bad URLs");
/**
 * Expands $url into the set of SHA-256 hashes required by the Google Safe
 * Browsing v4 suffix/prefix-expression rules: the canonical URL's host is
 * combined with several host-suffix and path-prefix variants, and each
 * combination is hashed. Raw 32-byte binary hashes are returned (callers use
 * only the first 4 bytes as the prefix).
 *
 * NOTE(review): several assignment/guard lines (e.g. where $host, $path,
 * $query, $hosts, $paths, $key are populated) are omitted from this excerpt,
 * so the exact variant set cannot be fully confirmed from this view.
 *
 * @param string $url URL to expand and hash.
 * @return string[] Map of expression => 32-byte binary SHA-256 hash (presumably — TODO confirm return line).
 */
protected function _generateHashes($url) {
//The GSB specification requires generating and sending hash prefixes for a number of additional similar URLs. See: https://developers.google.com/safe-browsing/v4/urls-hashing#suffixprefix-expressions
$canonicalURL = $this->_canonicalizeURL($url);
// Strip the scheme; GSB expressions are scheme-less.
if (preg_match('~^([a-z]+[a-z0-9+\.\-]*)://(.*)$~i', $canonicalURL, $matches)) {
$scheme = strtolower($matches[1]);
$canonicalURL = $matches[2];
//Separate URL and query string
if (preg_match('/^([^?]+)(\??.*)/', $canonicalURL, $matches)) {
$canonicalURL = $matches[1];
// Split host from path (path group optional).
preg_match('~^(.*?)(?:(/.*)|$)~', $canonicalURL, $matches);
if (isset($matches[2])) {
$host = $this->_normalizeHost($host);
// IP-literal hosts get no suffix expansion (GSB rule).
if (filter_var(trim($host, '[]'), FILTER_VALIDATE_IP)) {
$hostComponents = explode('.', $host);
$numComponents = count($hostComponents) - 7;
if ($numComponents < 1) {
// Build host-suffix variants (trailing components of the hostname).
for ($i = $numComponents; $i < count($hostComponents) - 1; $i++) {
$hosts[] = implode('.', array_slice($hostComponents, $i));
// Build path-prefix variants, capped at 4 components per the GSB spec.
$pathComponents = array_filter(explode('/', $path));
$numComponents = min(count($pathComponents), 4);
for ($i = 1; $i < $numComponents; $i++) {
$paths[] = '/' . implode('/', array_slice($pathComponents, 0, $i)) . '/';
if (strlen($query) > 0) {
$paths[] = $path . '?' . $query;
$paths = array_reverse($paths); //So we start at the most specific and move to most generic
$hashes[$h] = hash('sha256', $h, true); //WFSB compatibility -- it uses hashes without the path
$hashes[$key] = hash('sha256', $key, true);
break; //We no longer have any use for the extra path variants, so just include the primary one and exit the loop after
/**
 * Canonicalizes $url per the Google Safe Browsing v4 rules: strip fragment
 * and tab/newline characters, normalize percent-escapes, lowercase the
 * scheme, normalize the host, and resolve '.'/'..' and duplicate slashes in
 * the path. Returns false when normalization fails.
 *
 * NOTE(review): some assignment lines (where $host, $path, $query are set
 * from the regex matches) appear to be omitted from this excerpt.
 *
 * @param string $url Raw URL.
 * @return string|false Canonical "scheme://host/path?query" form, or false on failure.
 */
protected function _canonicalizeURL($url) { //Based on https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization and Google's reference implementation https://github.com/google/safebrowsing/blob/master/urls.go
// Drop the fragment (everything after the first '#').
$url = $this->_array_first(explode('#', $url));
// Remove embedded tabs/CR/LF, then fully normalize percent-escaping.
$url = preg_replace('/[\t\n\r]/', '', $url);
$url = $this->_normalizeEscape($url);
if ($url === false) { return false; }
if (preg_match('~^([a-z]+[a-z0-9+\.\-]*)://(.*)$~i', $url, $matches)) {
$scheme = strtolower($matches[1]);
//Separate URL and query string
if (preg_match('/^([^?]+)(\??.*)/', $url, $matches)) {
// Remember whether the pre-normalization path ended in '/' so it can be restored.
$endsWithSlash = substr($url, -1) == '/';
preg_match('~^(.*?)(?:(/.*)|$)~', $url, $matches);
if (isset($matches[2])) {
$host = $this->_normalizeHost($host);
if ($host === false) { return false; }
$path = preg_replace('~//+~', '/', $path); //Multiple slashes -> single slash
$path = preg_replace('~(?:^|/)\.(?:$|/)~', '/', $path); //. path components removed
while (preg_match('~/(?!\.\./)[^/]+/\.\.(?:$|/)~', $path)) { //Resolve ..
$path = preg_replace('~/(?!\.\./)[^/]+/\.\.(?:$|/)~', '/', $path, 1);
$path = preg_replace('~(?:^|/)\.\.(?:$|/)~', '/', $path); //Eliminate .. at the beginning
// Collapse leftover dot runs.
$path = trim($path, '.');
$path = preg_replace('/\.\.+/', '.', $path);
if ($path == '.' || $path == '') {
else if ($endsWithSlash && substr($path, -1) != '/') {
return $scheme . '://' . $host . $path . $query;
/**
 * Repeatedly percent-decodes $url until no %XX sequences remain (handles
 * double/triple encoding), then re-escapes control characters, high bytes,
 * '#', and '%' — yielding a single, consistent level of escaping per the
 * GSB canonicalization rules.
 *
 * @param string $url URL to normalize.
 * @return string Normalized URL.
 */
protected function _normalizeEscape($url) {
while (preg_match('/%([0-9a-f]{2})/i', $url)) {
$url = preg_replace_callback('/%([0-9a-f]{2})/i', array($this, '_hex2binCallback'), $url);
return preg_replace_callback('/[\x00-\x20\x7f-\xff#%]/', array($this, '_bin2hexCallback'), $url);
/**
 * preg_replace_callback helper for _normalizeEscape(): converts a captured
 * two-digit hex escape (the "XX" of "%XX") to its raw byte.
 *
 * @param array $matches PCRE matches; $matches[1] is the two hex digits.
 * @return string The decoded byte.
 */
protected function _hex2binCallback($matches) {
return wfUtils::hex2bin($matches[1]);
protected function _bin2hexCallback($matches) {