请注意,当与 UTF-8 一起使用时,mb_strtolower 仅会将标记为 Unicode 属性“大写字母”(“Lu”)的大写字符转换为小写。但是,也有一些字母,例如“字母数字”(Unicode 属性“Nl”),也具有小写和大写变体。这些字符不会被 mb_strtolower 转换!
示例
罗马数字 Ⅰ、Ⅱ、Ⅲ、...、Ⅿ(Unicode 码位 8544 到 8559)也有各自的小写变体 ⅰ、ⅱ、ⅲ、...、ⅿ(Unicode 码位 8560 到 8575),在我看来,它们也应该被 mb_strtolower 转换,但实际上并没有!
大型互联网公司(如 Google)将这两个变体视为语义上相等(因为表示形式仅在大小写上有所不同)。
由于我在互联网上找不到任何能在 PHP 中把 UTF-8 字符串里的所有字符正确映射为对应小写形式的解决方案,所以我在下面提供一个针对 UTF-8 字符串的、硬编码的扩展版 mb_strtolower 函数:
该函数包装了现有的函数 mb_strtolower(),并另外替换了具有小写表示形式的大写 UTF8 字符。由于我在互联网上找不到任何合适的 Unicode 大小写字符表,我检查了前一百万个 UTF8 字符,与 Google 搜索和关键字工具进行了比较,并确定以下 78 个字符为大写字符,未被 mb_strtolower 替换,但具有 UTF8 小写对应形式。
<?php
/**
 * Lower-cases a UTF-8 string, with an extra hard-coded replacement pass.
 *
 * The input is first converted with mb_strtolower() using the "UTF-8"
 * encoding. The result is then passed through strtr() with a fixed table of
 * 78 upper-case/title-case characters mapped to their lower-case forms
 * (Latin digraphs, a few Greek letters, Greek letters with iota subscript,
 * Roman numerals, circled Latin letters and two Deseret letters). According
 * to the accompanying note, these are characters that mb_strtolower() was
 * observed to leave unconverted.
 *
 * @param string $utf8_string UTF-8 encoded text to convert.
 * @return string The lower-cased text.
 */
function strtolower_utf8_extended( $utf8_string )
{
    // Standard multibyte lower-casing comes first; the table below only has
    // to deal with whatever that step did not convert.
    $lowered = mb_strtolower( $utf8_string, 'UTF-8' );

    return strtr( $lowered, [
        // Latin digraphs
        'Dž' => 'dž', 'Lj' => 'lj', 'Nj' => 'nj', 'Dz' => 'dz',
        // Greek: sho, lunate sigma, san
        'Ϸ' => 'ϸ', 'Ϲ' => 'ϲ', 'Ϻ' => 'ϻ',
        // Greek alpha with iota subscript (prosgegrammeni forms)
        'ᾈ' => 'ᾀ', 'ᾉ' => 'ᾁ', 'ᾊ' => 'ᾂ', 'ᾋ' => 'ᾃ', 'ᾌ' => 'ᾄ', 'ᾍ' => 'ᾅ', 'ᾎ' => 'ᾆ', 'ᾏ' => 'ᾇ',
        // Greek eta with iota subscript
        'ᾘ' => 'ᾐ', 'ᾙ' => 'ᾑ', 'ᾚ' => 'ᾒ', 'ᾛ' => 'ᾓ', 'ᾜ' => 'ᾔ', 'ᾝ' => 'ᾕ', 'ᾞ' => 'ᾖ', 'ᾟ' => 'ᾗ',
        // Greek omega with iota subscript
        'ᾨ' => 'ᾠ', 'ᾩ' => 'ᾡ', 'ᾪ' => 'ᾢ', 'ᾫ' => 'ᾣ', 'ᾬ' => 'ᾤ', 'ᾭ' => 'ᾥ', 'ᾮ' => 'ᾦ', 'ᾯ' => 'ᾧ',
        // Plain alpha / eta / omega with iota subscript
        'ᾼ' => 'ᾳ', 'ῌ' => 'ῃ', 'ῼ' => 'ῳ',
        // Roman numerals
        'Ⅰ' => 'ⅰ', 'Ⅱ' => 'ⅱ', 'Ⅲ' => 'ⅲ', 'Ⅳ' => 'ⅳ', 'Ⅴ' => 'ⅴ', 'Ⅵ' => 'ⅵ', 'Ⅶ' => 'ⅶ', 'Ⅷ' => 'ⅷ',
        'Ⅸ' => 'ⅸ', 'Ⅹ' => 'ⅹ', 'Ⅺ' => 'ⅺ', 'Ⅻ' => 'ⅻ', 'Ⅼ' => 'ⅼ', 'Ⅽ' => 'ⅽ', 'Ⅾ' => 'ⅾ', 'Ⅿ' => 'ⅿ',
        // Circled Latin letters
        'Ⓐ' => 'ⓐ', 'Ⓑ' => 'ⓑ', 'Ⓒ' => 'ⓒ', 'Ⓓ' => 'ⓓ', 'Ⓔ' => 'ⓔ', 'Ⓕ' => 'ⓕ', 'Ⓖ' => 'ⓖ', 'Ⓗ' => 'ⓗ', 'Ⓘ' => 'ⓘ',
        'Ⓙ' => 'ⓙ', 'Ⓚ' => 'ⓚ', 'Ⓛ' => 'ⓛ', 'Ⓜ' => 'ⓜ', 'Ⓝ' => 'ⓝ', 'Ⓞ' => 'ⓞ', 'Ⓟ' => 'ⓟ', 'Ⓠ' => 'ⓠ', 'Ⓡ' => 'ⓡ',
        'Ⓢ' => 'ⓢ', 'Ⓣ' => 'ⓣ', 'Ⓤ' => 'ⓤ', 'Ⓥ' => 'ⓥ', 'Ⓦ' => 'ⓦ', 'Ⓧ' => 'ⓧ', 'Ⓨ' => 'ⓨ', 'Ⓩ' => 'ⓩ',
        // Deseret letters
        '𐐦' => '𐑎', '𐐧' => '𐑏',
    ] );
} ?>