- # Unicode Support in PHP
- # How to use :
- # Output :
- # Converting Unicode characters to their numeric value and/or HTML entities using PHP
- # How to use :
- # Output :
- # Intl extension for Unicode support
- PHP Unicode Support in PHP
- Converting Unicode characters to “\uxxxx” format using PHP
- How to use :
- Output :
- Converting Unicode characters to their numeric value and/or HTML entities using PHP
- How to use :
- Output :
- Intl extension for Unicode support
# Unicode Support in PHP
You can use the following code to convert in both directions.
if (!function_exists('codepoint_encode')) { function codepoint_encode($str) { return substr(json_encode($str), 1, -1); } } if (!function_exists('codepoint_decode')) { function codepoint_decode($str) { return json_decode(sprintf('"%s"', $str)); } }
# How to use :
echo "\nUse JSON encoding / decoding\n"; var_dump(codepoint_encode("我好")); var_dump(codepoint_decode('\u6211\u597d'));
# Output :
Use JSON encoding / decoding string(12) "\u6211\u597d" string(6) "我好"
# Converting Unicode characters to their numeric value and/or HTML entities using PHP
You can use the following code to convert in both directions.
if (!function_exists('mb_internal_encoding')) { function mb_internal_encoding($encoding = NULL) { return ($from_encoding === NULL) ? iconv_get_encoding() : iconv_set_encoding($encoding); } } if (!function_exists('mb_convert_encoding')) { function mb_convert_encoding($str, $to_encoding, $from_encoding = NULL) { return iconv(($from_encoding === NULL) ? mb_internal_encoding() : $from_encoding, $to_encoding, $str); } } if (!function_exists('mb_chr')) { function mb_chr($ord, $encoding = 'UTF-8') { if ($encoding === 'UCS-4BE') { return pack("N", $ord); } else { return mb_convert_encoding(mb_chr($ord, 'UCS-4BE'), $encoding, 'UCS-4BE'); } } } if (!function_exists('mb_ord')) { function mb_ord($char, $encoding = 'UTF-8') { if ($encoding === 'UCS-4BE') { list(, $ord) = (strlen($char) === 4) ? @unpack('N', $char) : @unpack('n', $char); return $ord; } else { return mb_ord(mb_convert_encoding($char, 'UCS-4BE', $encoding), 'UCS-4BE'); } } } if (!function_exists('mb_htmlentities')) { function mb_htmlentities($string, $hex = true, $encoding = 'UTF-8') { return preg_replace_callback('/[\x{80}-\x{10FFFF}]/u', function ($match) use ($hex) { return sprintf($hex ? '&#x%X;' : '&#%d;', mb_ord($match[0])); }, $string); } } if (!function_exists('mb_html_entity_decode')) { function mb_html_entity_decode($string, $flags = null, $encoding = 'UTF-8') { return html_entity_decode($string, ($flags === NULL) ? ENT_COMPAT | ENT_HTML401 : $flags, $encoding); } }
# How to use :
echo "Get string from numeric DEC value\n"; var_dump(mb_chr(50319, 'UCS-4BE')); var_dump(mb_chr(271)); echo "\nGet string from numeric HEX value\n"; var_dump(mb_chr(0xC48F, 'UCS-4BE')); var_dump(mb_chr(0x010F)); echo "\nGet numeric value of character as DEC string\n"; var_dump(mb_ord('ď', 'UCS-4BE')); var_dump(mb_ord('ď')); echo "\nGet numeric value of character as HEX string\n"; var_dump(dechex(mb_ord('ď', 'UCS-4BE'))); var_dump(dechex(mb_ord('ď'))); echo "\nEncode / decode to DEC based HTML entities\n"; var_dump(mb_htmlentities('tchüß', false)); var_dump(mb_html_entity_decode('tch&#252;&#223;')); echo "\nEncode / decode to HEX based HTML entities\n"; var_dump(mb_htmlentities('tchüß')); var_dump(mb_html_entity_decode('tch&#xFC;&#xDF;'));
# Output :
Get string from numeric DEC value string(4) "ď" string(2) "ď" Get string from numeric HEX value string(4) "ď" string(2) "ď" Get numeric value of character as DEC int int(50319) int(271) Get numeric value of character as HEX string string(4) "c48f" string(3) "10f" Encode / decode to DEC based HTML entities string(15) "tch&#252;&#223;" string(7) "tchüß" Encode / decode to HEX based HTML entities string(15) "tch&#xFC;&#xDF;" string(7) "tchüß"
# Intl extension for Unicode support
Native string functions are mapped to single-byte functions, so they do not work well with Unicode. The iconv and mbstring extensions offer some support for Unicode, while the Intl extension offers full support. Intl is a wrapper for the de facto standard ICU library; see http://site.icu-project.org
ICU offers full internationalization, of which Unicode is only a small part. You can do transcoding easily:
\UConverter::transcode($sString, 'UTF-8', 'UTF-8'); // strip bad bytes against attacks
But, do not dismiss iconv just yet, consider:
\iconv('UTF-8', 'ASCII//TRANSLIT', "Cliënt"); // output: "Client"
PHP Unicode Support in PHP
Converting Unicode characters to “\uxxxx” format using PHP
You can use the following code to convert in both directions.
if (!function_exists('codepoint_encode')) { function codepoint_encode($str) { return substr(json_encode($str), 1, -1); } } if (!function_exists('codepoint_decode')) { function codepoint_decode($str) { return json_decode(sprintf('"%s"', $str)); } }
How to use :
echo "\nUse JSON encoding / decoding\n"; var_dump(codepoint_encode("我好")); var_dump(codepoint_decode('\u6211\u597d'));
Output :
Use JSON encoding / decoding string(12) "\u6211\u597d" string(6) "我好"
Converting Unicode characters to their numeric value and/or HTML entities using PHP
You can use the following code to convert in both directions.
if (!function_exists('mb_internal_encoding')) { function mb_internal_encoding($encoding = NULL) { return ($from_encoding === NULL) ? iconv_get_encoding() : iconv_set_encoding($encoding); } } if (!function_exists('mb_convert_encoding')) { function mb_convert_encoding($str, $to_encoding, $from_encoding = NULL) { return iconv(($from_encoding === NULL) ? mb_internal_encoding() : $from_encoding, $to_encoding, $str); } } if (!function_exists('mb_chr')) { function mb_chr($ord, $encoding = 'UTF-8') { if ($encoding === 'UCS-4BE') { return pack("N", $ord); } else { return mb_convert_encoding(mb_chr($ord, 'UCS-4BE'), $encoding, 'UCS-4BE'); } } } if (!function_exists('mb_ord')) { function mb_ord($char, $encoding = 'UTF-8') { if ($encoding === 'UCS-4BE') { list(, $ord) = (strlen($char) === 4) ? @unpack('N', $char) : @unpack('n', $char); return $ord; } else { return mb_ord(mb_convert_encoding($char, 'UCS-4BE', $encoding), 'UCS-4BE'); } } } if (!function_exists('mb_htmlentities')) { function mb_htmlentities($string, $hex = true, $encoding = 'UTF-8') { return preg_replace_callback('/[\x{80}-\x{10FFFF}]/u', function ($match) use ($hex) { return sprintf($hex ? '&#x%X;' : '&#%d;', mb_ord($match[0])); }, $string); } } if (!function_exists('mb_html_entity_decode')) { function mb_html_entity_decode($string, $flags = null, $encoding = 'UTF-8') { return html_entity_decode($string, ($flags === NULL) ? ENT_COMPAT | ENT_HTML401 : $flags, $encoding); } }
How to use :
echo "Get string from numeric DEC value\n"; var_dump(mb_chr(50319, 'UCS-4BE')); var_dump(mb_chr(271)); echo "\nGet string from numeric HEX value\n"; var_dump(mb_chr(0xC48F, 'UCS-4BE')); var_dump(mb_chr(0x010F)); echo "\nGet numeric value of character as DEC string\n"; var_dump(mb_ord('ď', 'UCS-4BE')); var_dump(mb_ord('ď')); echo "\nGet numeric value of character as HEX string\n"; var_dump(dechex(mb_ord('ď', 'UCS-4BE'))); var_dump(dechex(mb_ord('ď'))); echo "\nEncode / decode to DEC based HTML entities\n"; var_dump(mb_htmlentities('tchüß', false)); var_dump(mb_html_entity_decode('tch&#252;&#223;')); echo "\nEncode / decode to HEX based HTML entities\n"; var_dump(mb_htmlentities('tchüß')); var_dump(mb_html_entity_decode('tch&#xFC;&#xDF;'));
Output :
Get string from numeric DEC value string(4) "ď" string(2) "ď" Get string from numeric HEX value string(4) "ď" string(2) "ď" Get numeric value of character as DEC int int(50319) int(271) Get numeric value of character as HEX string string(4) "c48f" string(3) "10f" Encode / decode to DEC based HTML entities string(15) "tch&#252;&#223;" string(7) "tchüß" Encode / decode to HEX based HTML entities string(15) "tch&#xFC;&#xDF;" string(7) "tchüß"
Intl extension for Unicode support
Native string functions are mapped to single-byte functions, so they do not work well with Unicode. The iconv and mbstring extensions offer some support for Unicode, while the Intl extension offers full support. Intl is a wrapper for the de facto standard ICU library; see http://site.icu-project.org for detailed information that is not available on http://php.net/manual/en/book.intl.php . If you cannot install the extension, have a look at an alternative implementation of Intl from the Symfony framework.
ICU offers full internationalization, of which Unicode is only a small part. You can do transcoding easily:
\UConverter::transcode($sString, 'UTF-8', 'UTF-8'); // strip bad bytes against attacks
But, do not dismiss iconv just yet, consider:
\iconv('UTF-8', 'ASCII//TRANSLIT', "Cliënt"); // output: "Client"