Edit File by line

<?php

[0] Fix | Delete

[1] Fix | Delete

/**
 * HTML API: WP_HTML_Decoder class
 *
 * Decodes spans of raw text found inside HTML content.
 *
 * @package WordPress
 * @subpackage HTML-API
 * @since 6.6.0
 */

class WP_HTML_Decoder {
	/**
	 * Indicates if an attribute value starts with a given raw string value.
	 *
	 * Use this method to determine if an attribute value starts with a given string, regardless
	 * of how it might be encoded in HTML. For instance, `http:` could be represented as `http:`
	 * or as `http&colon;` or as `&#x68;ttp:` or as `h&#116;tp&colon;`, or in many other ways.
	 *
	 * An attribute value which ends before the whole search text has been matched does
	 * not start with that text: `http` does not start with `https:`.
	 *
	 * Example:
	 *
	 *     $value = 'http&colon;//wordpress.org/';
	 *     true   === WP_HTML_Decoder::attribute_starts_with( $value, 'http:', 'ascii-case-insensitive' );
	 *     false  === WP_HTML_Decoder::attribute_starts_with( $value, 'https:', 'ascii-case-insensitive' );
	 *     false  === WP_HTML_Decoder::attribute_starts_with( 'http', 'https:' );
	 *
	 * @since 6.6.0
	 *
	 * @param string $haystack         String containing the raw non-decoded attribute value.
	 * @param string $search_text      Does the attribute value start with this plain string.
	 * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching.
	 *                                 Default 'case-sensitive'.
	 * @return bool Whether the attribute value starts with the given string.
	 */
	public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ) {
		$search_length = strlen( $search_text );
		$loose_case    = 'ascii-case-insensitive' === $case_sensitivity;
		$haystack_end  = strlen( $haystack );
		$search_at     = 0;
		$haystack_at   = 0;

		while ( $search_at < $search_length && $haystack_at < $haystack_end ) {
			$chars_match = $loose_case
				? strtolower( $haystack[ $haystack_at ] ) === strtolower( $search_text[ $search_at ] )
				: $haystack[ $haystack_at ] === $search_text[ $search_at ];

			// Only an ampersand can start a character reference; anything else is compared byte-for-byte.
			$is_introducer = '&' === $haystack[ $haystack_at ];
			$next_chunk    = $is_introducer
				? self::read_character_reference( 'attribute', $haystack, $haystack_at, $token_length )
				: null;

			// If there's no character reference and the characters don't match, the match fails.
			if ( null === $next_chunk && ! $chars_match ) {
				return false;
			}

			// If there's no character reference but the characters do match, then it could still match.
			if ( null === $next_chunk && $chars_match ) {
				++$haystack_at;
				++$search_at;
				continue;
			}

			// If there is a character reference, then the decoded value must exactly match what follows in the search string.
			if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, strlen( $next_chunk ), $loose_case ) ) {
				return false;
			}

			// The character reference matched, so continue checking.
			$haystack_at += $token_length;
			$search_at   += strlen( $next_chunk );
		}

		/*
		 * The loop stops either because the whole search text was matched or because
		 * the attribute value ran out first. Only the former is a successful match;
		 * a value that is shorter than the search text cannot start with it.
		 */
		return $search_at === $search_length;
	}

	/**
	 * Returns a string containing the decoded value of a given HTML text node.
	 *
	 * Text nodes appear in HTML DATA sections, which are the text segments inside
	 * and around tags, excepting SCRIPT and STYLE elements (and some others),
	 * whose inner text is not decoded. Use this function to read the decoded
	 * value of such a text span in an HTML document.
	 *
	 * Example:
	 *
	 *     '“😄”' === WP_HTML_Decoder::decode_text_node( '&#x93;&#x1f604;&#x94' );
	 *
	 * @since 6.6.0
	 *
	 * @param string $text Text containing raw and non-decoded text node to decode.
	 * @return string Decoded UTF-8 value of given text node.
	 */
	public static function decode_text_node( $text ) {
		return static::decode( 'data', $text );
	}

	/**
	 * Returns a string containing the decoded value of a given HTML attribute.
	 *
	 * Text found inside an HTML attribute has different parsing rules than for
	 * text found inside other markup, or DATA segments. Use this function to
	 * read the decoded value of an HTML string inside a quoted attribute.
	 *
	 * Example:
	 *
	 *     '“😄”' === WP_HTML_Decoder::decode_attribute( '&#x93;&#x1f604;&#x94' );
	 *
	 * @since 6.6.0
	 *
	 * @param string $text Text containing raw and non-decoded attribute value to decode.
	 * @return string Decoded UTF-8 value of given attribute value.
	 */
	public static function decode_attribute( $text ) {
		return static::decode( 'attribute', $text );
	}

	/**
	 * Decodes a span of HTML text, depending on the context in which it's found.
	 *
	 * This is a low-level method; prefer calling WP_HTML_Decoder::decode_attribute() or
	 * WP_HTML_Decoder::decode_text_node() instead. It's provided for cases where this
	 * may be difficult to do from calling code.
	 *
	 * Example:
	 *
	 *     '©' === WP_HTML_Decoder::decode( 'data', '&copy;' );
	 *
	 * @since 6.6.0
	 *
	 * @access private
	 *
	 * @param string $context `attribute` for decoding attribute values, `data` otherwise.
	 * @param string $text    Text document containing span of text to decode.
	 * @return string Decoded UTF-8 string.
	 */
	public static function decode( $context, $text ) {
		$decoded = '';
		$end     = strlen( $text );
		$at      = 0;
		$was_at  = 0;

		while ( $at < $end ) {
			$next_character_reference_at = strpos( $text, '&', $at );
			if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
				break;
			}

			$character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $token_length );
			if ( isset( $character_reference ) ) {
				// Copy the plaintext preceding the reference, then the decoded reference itself.
				$at       = $next_character_reference_at;
				$decoded .= substr( $text, $was_at, $at - $was_at );
				$decoded .= $character_reference;
				$at      += $token_length;
				$was_at   = $at;
				continue;
			}

			// Not a character reference: step forward; a later strpos() finds the next candidate ampersand.
			++$at;
		}

		// Nothing was decoded, so the input can be returned untouched.
		if ( 0 === $was_at ) {
			return $text;
		}

		// Append whatever plaintext follows the final character reference.
		if ( $was_at < $end ) {
			$decoded .= substr( $text, $was_at, $end - $was_at );
		}

		return $decoded;
	}

	/**
	 * Attempt to read a character reference at the given location in a given string,
	 * depending on the context in which it's found.
	 *
	 * If a character reference is found, this function will return the translated value
	 * that the reference maps to. It will then set `$match_byte_length` to the
	 * number of bytes of input it read while consuming the character reference. This
	 * gives calling code the opportunity to advance its cursor when traversing a string
	 * and decoding.
	 *
	 * Example:
	 *
	 *     null === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 0 );
	 *     '…'  === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 5, $token_length );
	 *     8    === $token_length; // `&hellip;`
	 *
	 *     null === WP_HTML_Decoder::read_character_reference( 'attribute', '&notin', 0 );
	 *     '∉'  === WP_HTML_Decoder::read_character_reference( 'attribute', '&notin;', 0, $token_length );
	 *     7    === $token_length; // `&notin;`
	 *
	 *     '¬'  === WP_HTML_Decoder::read_character_reference( 'data', '&notin', 0, $token_length );
	 *     4    === $token_length; // `&not`
	 *     '∉'  === WP_HTML_Decoder::read_character_reference( 'data', '&notin;', 0, $token_length );
	 *     7    === $token_length; // `&notin;`
	 *
	 * @since 6.6.0
	 *
	 * @global WP_Token_Map $html5_named_character_references Mappings for HTML5 named character references.
	 *
	 * @param string $context            `attribute` for decoding attribute values, `data` otherwise.
	 * @param string $text               Text document containing span of text to decode.
	 * @param int    $at                 Optional. Byte offset into text where span begins, defaults to the beginning (0).
	 * @param int    &$match_byte_length Optional. Set to byte-length of character reference if provided and if a match
	 *                                   is found, otherwise not set. Default null.
	 * @return string|null Decoded character reference in UTF-8 if found, otherwise `null`.
	 */
	public static function read_character_reference( $context, $text, $at = 0, &$match_byte_length = null ) {
		/**
		 * Mappings for HTML5 named character references.
		 *
		 * @var WP_Token_Map $html5_named_character_references
		 */
		global $html5_named_character_references;

		// A character reference needs at least one byte after the ampersand.
		$length = strlen( $text );
		if ( $at + 1 >= $length ) {
			return null;
		}

		if ( '&' !== $text[ $at ] ) {
			return null;
		}

		/*
		 * Numeric character references.
		 *
		 * When truncated, these will encode the code point found by parsing the
		 * digits that are available. For example, when `&#x1f170;` is truncated
		 * to `&#x1f1` it will encode `Ǳ`. It does not:
		 *  - know how to parse the original `🅰`.
		 *  - fail to parse and return plaintext `&#x1f1`.
		 *  - fail to parse and return the replacement character `�`
		 */
		if ( '#' === $text[ $at + 1 ] ) {
			if ( $at + 2 >= $length ) {
				return null;
			}

			/** Tracks inner parsing within the numeric character reference. */
			$digits_at = $at + 2;

			if ( 'x' === $text[ $digits_at ] || 'X' === $text[ $digits_at ] ) {
				$numeric_base   = 16;
				$numeric_digits = '0123456789abcdefABCDEF';
				$max_digits     = 6; // &#x10FFFF;
				++$digits_at;
			} else {
				$numeric_base   = 10;
				$numeric_digits = '0123456789';
				$max_digits     = 7; // &#1114111;
			}

			// Cannot encode invalid Unicode code points. Max is to U+10FFFF.
			$zero_count    = strspn( $text, '0', $digits_at );
			$digit_count   = strspn( $text, $numeric_digits, $digits_at + $zero_count );
			$after_digits  = $digits_at + $zero_count + $digit_count;
			$has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ];
			$end_of_span   = $has_semicolon ? $after_digits + 1 : $after_digits;

			// `&#` or `&#x` without digits returns into plaintext.
			if ( 0 === $digit_count && 0 === $zero_count ) {
				return null;
			}

			// Whereas `&#` and only zeros is invalid.
			if ( 0 === $digit_count ) {
				$match_byte_length = $end_of_span - $at;
				return '�';
			}

			// If there are too many digits then it's not worth parsing. It's invalid.
			if ( $digit_count > $max_digits ) {
				$match_byte_length = $end_of_span - $at;
				return '�';
			}

			$digits     = substr( $text, $digits_at + $zero_count, $digit_count );
			$code_point = intval( $digits, $numeric_base );

			/*
			 * Noncharacters, 0x0D, and non-ASCII-whitespace control characters.
			 *
			 * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
			 * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
			 * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
			 * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
			 * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
			 * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
			 *
			 * A C0 control is a code point that is in the range of U+00 to U+1F,
			 * but ASCII whitespace includes U+09, U+0A, U+0C, and U+0D.
			 *
			 * These characters are invalid but still decode as any valid character.
			 * This comment is here to note and explain why there's no check to
			 * remove these characters or replace them.
			 *
			 * @see https://infra.spec.whatwg.org/#noncharacter
			 */

			/*
			 * Code points in the C1 controls area need to be remapped as if they
			 * were stored in Windows-1252. Note! This transformation only happens
			 * for numeric character references. The raw code points in the byte
			 * stream are not translated.
			 *
			 * > If the number is one of the numbers in the first column of
			 * > the following table, then find the row with that number in
			 * > the first column, and set the character reference code to
			 * > the number in the second column of that row.
			 */
			if ( $code_point >= 0x80 && $code_point <= 0x9F ) {
				$windows_1252_mapping = array(
					0x20AC, // 0x80 -> EURO SIGN (€).
					0x81,   // 0x81 -> (no change).
					0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚).
					0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ).
					0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („).
					0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…).
					0x2020, // 0x86 -> DAGGER (†).
					0x2021, // 0x87 -> DOUBLE DAGGER (‡).
					0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ).
					0x2030, // 0x89 -> PER MILLE SIGN (‰).
					0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š).
					0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹).
					0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ).
					0x8D,   // 0x8D -> (no change).
					0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž).
					0x8F,   // 0x8F -> (no change).
					0x90,   // 0x90 -> (no change).
					0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘).
					0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’).
					0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“).
					0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”).
					0x2022, // 0x95 -> BULLET (•).
					0x2013, // 0x96 -> EN DASH (–).
					0x2014, // 0x97 -> EM DASH (—).
					0x02DC, // 0x98 -> SMALL TILDE (˜).
					0x2122, // 0x99 -> TRADE MARK SIGN (™).
					0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š).
					0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›).
					0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ).
					0x9D,   // 0x9D -> (no change).
					0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž).
					0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ).
				);

				$code_point = $windows_1252_mapping[ $code_point - 0x80 ];
			}

			$match_byte_length = $end_of_span - $at;
			return self::code_point_to_utf8_bytes( $code_point );
		}

		/** Tracks inner parsing within the named character reference. */
		$name_at = $at + 1;
		// Minimum named character reference is two characters. E.g. `GT`.
		if ( $name_at + 2 > $length ) {
			return null;
		}

		$name_length = 0;
		$replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length );
		if ( false === $replacement ) {
			return null;
		}

		$after_name = $name_at + $name_length;

		// If the match ended with a semicolon then it should always be decoded.
		if ( ';' === $text[ $after_name - 1 ] ) {
			$match_byte_length = $after_name - $at;
			return $replacement;
		}

		/*
		 * At this point though there's a match for an entry in the named
		 * character reference table but the match doesn't end in `;`.
		 * It may be allowed if it's followed by something unambiguous.
		 */
		$ambiguous_follower = (
			$after_name < $length &&
			(
				ctype_alnum( $text[ $after_name ] ) ||
				'=' === $text[ $after_name ]
			)
		);

		// It's non-ambiguous, safe to leave it in.
		if ( ! $ambiguous_follower ) {
			$match_byte_length = $after_name - $at;
			return $replacement;
		}

		// It's ambiguous, which isn't allowed inside attributes.
		if ( 'attribute' === $context ) {
			return null;
		}

		$match_byte_length = $after_name - $at;
		return $replacement;
	}

	/**
	 * Encode a code point number into the UTF-8 encoding.
	 *
	 * This encoder implements the UTF-8 encoding algorithm for converting
	 * a code point into a byte sequence. If it receives an invalid code
	 * point it will return the Unicode Replacement Character U+FFFD `�`.
	 *
	 * Example:
	 *
	 *     '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 );
	 *
	 *     // Half of a surrogate pair is an invalid code point.
	 *     '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c );
	 *
	 * @since 6.6.0
	 *
	 * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
	 *
	 * @param int $code_point Which code point to convert.
	 * @return string Converted code point, or `�` if invalid.
	 */
	public static function code_point_to_utf8_bytes( $code_point ) {
		// Pre-check to ensure a valid code point.
		if (
			$code_point <= 0 ||
			( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
			$code_point > 0x10FFFF
		) {
			return '�';
		}

		// One byte: 0xxxxxxx.
		if ( $code_point <= 0x7F ) {
			return chr( $code_point );
		}

		// Two bytes: 110xxxxx 10xxxxxx.
		if ( $code_point <= 0x7FF ) {
			$byte1 = ( $code_point >> 6 ) | 0xC0;
			$byte2 = $code_point & 0x3F | 0x80;

			return pack( 'CC', $byte1, $byte2 );
		}

		// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
		if ( $code_point <= 0xFFFF ) {
			$byte1 = ( $code_point >> 12 ) | 0xE0;
			$byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
			$byte3 = $code_point & 0x3F | 0x80;

			return pack( 'CCC', $byte1, $byte2, $byte3 );
		}

		// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
		// Any values above U+10FFFF are eliminated above in the pre-check.
		$byte1 = ( $code_point >> 18 ) | 0xF0;
		$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
		$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
		$byte4 = $code_point & 0x3F | 0x80;

		return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
	}
}

[460] Fix | Delete

[461] Fix | Delete