commit 6b6386ac38954400ce3057c697d42a2130cc376a Author: simon987 Date: Sun Jun 7 12:40:18 2020 -0400 initial commit, wip diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f5b17b3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +cmake-build-debug/ +CMakeFiles +cmake_install.cmake +Makefile +CMakeCache.txt +*.so +*.cbp +.idea/ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..ad7b8e9 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.13) +project(asciifolding C) + +set(CMAKE_C_STANDARD 99) + +find_package(PostgreSQL REQUIRED) + +include_directories(${PostgreSQL_INCLUDE_DIRS}) + +add_library(asciifolding SHARED asciifolding.c) +target_compile_options( + asciifolding + PRIVATE + -Ofast + -march=native +) \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..db66558 --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +# PostgreSQL ASCII folding + +Reasonably fast ASCII folding functions (based on [Lucene asciifolding filter](https://lucene.apache.org/core/4_0_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html)) for PostgreSQL. + +*Example:* +``` +postgres=# SELECT asciifold('Hello, ⒩ᴐⱤú⒴⁈~!'); + asciifold +---------------------- + Hello, (n)ORu(y)?!~! +(1 row) +``` + +The UTF-8 input string is not sanitized (invalid UTF-8 might lead to undefined behavior). + +### Compiling from source (CMake) + +```bash +apt install postgresql-server-dev-11 +cmake . 
+make +``` + +See [asciifolding.c](asciifolding.c) & [build.sh](build.sh) for more information + + diff --git a/asciifolding.c b/asciifolding.c new file mode 100644 index 0000000..5c4d483 --- /dev/null +++ b/asciifolding.c @@ -0,0 +1,757 @@ +#include +#include +#include "utf8.h" + +PG_MODULE_MAGIC; + +static const int offsets[] = {0x00000000, 0x00000001, 0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000006, + 0x00000007, 0x00000008, 0x00000009, 0x0000000A, 0x0000000B, 0x0000000C, 0x0000000D, + 0x0000000E, 0x0000000F, 0x00000010, 0x00000011, 0x00000012, 0x00000013, 0x00000014, + 0x00000015, 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x0000001A, 0x0000001B, + 0x0000001C, 0x0000001D, 0x0000001E, 0x0000001F, 0x00000020, 0x00000021, 0x00000022, + 0x00000023, 0x00000024, 0x00000025, 0x00000026, 0x00000027, 0x00000028, 0x00000029, + 0x0000002A, 0x0000002B, 0x0000002C, 0x0000002D, 0x0000002E, 0x0000002F, 0x00000030, + 0x00000031, 0x00000032, 0x00000033, 0x00000034, 0x00000035, 0x00000036, 0x00000037, + 0x00000038, 0x00000039, 0x0000003A, 0x0000003B, 0x0000003C, 0x0000003D, 0x0000003E, + 0x0000003F, 0x00000040, 0x00000041, 0x00000042, 0x00000043, 0x00000044, 0x00000045, + 0x00000046, 0x00000047, 0x00000048, 0x00000049, 0x0000004A, 0x0000004B, 0x0000004C, + 0x0000004D, 0x0000004E, 0x0000004F, 0x00000050, 0x00000051, 0x00000052, 0x00000053, + 0x00000054, 0x00000055, 0x00000056, 0x00000057, 0x00000058, 0x00000059, 0x0000005A, + 0x0000005B, 0x0000005C, 0x0000005D, 0x0000005E, 0x0000005F, 0x00000060, 0x00000061, + 0x00000062, 0x00000063, 0x00000064, 0x00000065, 0x00000066, 0x00000067, 0x00000068, + 0x00000069, 0x0000006A, 0x0000006B, 0x0000006C, 0x0000006D, 0x0000006E, 0x0000006F, + 0x00000070, 0x00000071, 0x00000072, 0x00000073, 0x00000074, 0x00000075, 0x00000076, + 0x00000077, 0x00000078, 0x00000079, 0x0000007A, 0x0000007B, 0x0000007C, 0x0000007D, + 0x0000007E, 0x0000007F, 0x000000C0, 0x000000C1, 0x000000C2, 0x000000C3, 0x000000C4, + 0x000000C5, 0x00000100, 
0x00000102, 0x00000104, 0x0000018F, 0x000001CD, 0x000001DE, + 0x000001E0, 0x000001FA, 0x00000200, 0x00000202, 0x00000226, 0x0000023A, 0x00001D00, + 0x00001E00, 0x00001EA0, 0x00001EA2, 0x00001EA4, 0x00001EA6, 0x00001EA8, 0x00001EAA, + 0x00001EAC, 0x00001EAE, 0x00001EB0, 0x00001EB2, 0x00001EB4, 0x00001EB6, 0x000024B6, + 0x0000FF21, 0x000000E0, 0x000000E1, 0x000000E2, 0x000000E3, 0x000000E4, 0x000000E5, + 0x00000101, 0x00000103, 0x00000105, 0x000001CE, 0x000001DF, 0x000001E1, 0x000001FB, + 0x00000201, 0x00000203, 0x00000227, 0x00000250, 0x00000259, 0x0000025A, 0x00001D8F, + 0x00001D95, 0x00001E01, 0x00001E9A, 0x00001EA1, 0x00001EA3, 0x00001EA5, 0x00001EA7, + 0x00001EA9, 0x00001EAB, 0x00001EAD, 0x00001EAF, 0x00001EB1, 0x00001EB3, 0x00001EB5, + 0x00001EB7, 0x00002090, 0x00002094, 0x000024D0, 0x00002C65, 0x00002C6F, 0x0000FF41, + 0x0000A732, 0x000000C6, 0x000001E2, 0x000001FC, 0x00001D01, 0x0000A734, 0x0000A736, + 0x0000A738, 0x0000A73A, 0x0000A73C, 0x0000249C, 0x0000A733, 0x000000E6, 0x000001E3, + 0x000001FD, 0x00001D02, 0x0000A735, 0x0000A737, 0x0000A739, 0x0000A73B, 0x0000A73D, + 0x00000181, 0x00000182, 0x00000243, 0x00000299, 0x00001D03, 0x00001E02, 0x00001E04, + 0x00001E06, 0x000024B7, 0x0000FF22, 0x00000180, 0x00000183, 0x00000253, 0x00001D6C, + 0x00001D80, 0x00001E03, 0x00001E05, 0x00001E07, 0x000024D1, 0x0000FF42, 0x0000249D, + 0x000000C7, 0x00000106, 0x00000108, 0x0000010A, 0x0000010C, 0x00000187, 0x0000023B, + 0x00000297, 0x00001D04, 0x00001E08, 0x000024B8, 0x0000FF23, 0x000000E7, 0x00000107, + 0x00000109, 0x0000010B, 0x0000010D, 0x00000188, 0x0000023C, 0x00000255, 0x00001E09, + 0x00002184, 0x000024D2, 0x0000A73E, 0x0000A73F, 0x0000FF43, 0x0000249E, 0x000000D0, + 0x0000010E, 0x00000110, 0x00000189, 0x0000018A, 0x0000018B, 0x00001D05, 0x00001D06, + 0x00001E0A, 0x00001E0C, 0x00001E0E, 0x00001E10, 0x00001E12, 0x000024B9, 0x0000A779, + 0x0000FF24, 0x000000F0, 0x0000010F, 0x00000111, 0x0000018C, 0x00000221, 0x00000256, + 0x00000257, 0x00001D6D, 0x00001D81, 
0x00001D91, 0x00001E0B, 0x00001E0D, 0x00001E0F, + 0x00001E11, 0x00001E13, 0x000024D3, 0x0000A77A, 0x0000FF44, 0x000001C4, 0x000001F1, + 0x000001C5, 0x000001F2, 0x0000249F, 0x00000238, 0x000001C6, 0x000001F3, 0x000002A3, + 0x000002A5, 0x000000C8, 0x000000C9, 0x000000CA, 0x000000CB, 0x00000112, 0x00000114, + 0x00000116, 0x00000118, 0x0000011A, 0x0000018E, 0x00000190, 0x00000204, 0x00000206, + 0x00000228, 0x00000246, 0x00001D07, 0x00001E14, 0x00001E16, 0x00001E18, 0x00001E1A, + 0x00001E1C, 0x00001EB8, 0x00001EBA, 0x00001EBC, 0x00001EBE, 0x00001EC0, 0x00001EC2, + 0x00001EC4, 0x00001EC6, 0x000024BA, 0x00002C7B, 0x0000FF25, 0x000000E8, 0x000000E9, + 0x000000EA, 0x000000EB, 0x00000113, 0x00000115, 0x00000117, 0x00000119, 0x0000011B, + 0x000001DD, 0x00000205, 0x00000207, 0x00000229, 0x00000247, 0x00000258, 0x0000025B, + 0x0000025C, 0x0000025D, 0x0000025E, 0x0000029A, 0x00001D08, 0x00001D92, 0x00001D93, + 0x00001D94, 0x00001E15, 0x00001E17, 0x00001E19, 0x00001E1B, 0x00001E1D, 0x00001EB9, + 0x00001EBB, 0x00001EBD, 0x00001EBF, 0x00001EC1, 0x00001EC3, 0x00001EC5, 0x00001EC7, + 0x00002091, 0x000024D4, 0x00002C78, 0x0000FF45, 0x000024A0, 0x00000191, 0x00001E1E, + 0x000024BB, 0x0000A730, 0x0000A77B, 0x0000A7FB, 0x0000FF26, 0x00000192, 0x00001D6E, + 0x00001D82, 0x00001E1F, 0x00001E9B, 0x000024D5, 0x0000A77C, 0x0000FF46, 0x000024A1, + 0x0000FB00, 0x0000FB03, 0x0000FB04, 0x0000FB01, 0x0000FB02, 0x0000011C, 0x0000011E, + 0x00000120, 0x00000122, 0x00000193, 0x000001E4, 0x000001E5, 0x000001E6, 0x000001E7, + 0x000001F4, 0x00000262, 0x0000029B, 0x00001E20, 0x000024BC, 0x0000A77D, 0x0000A77E, + 0x0000FF27, 0x0000011D, 0x0000011F, 0x00000121, 0x00000123, 0x000001F5, 0x00000260, + 0x00000261, 0x00001D77, 0x00001D79, 0x00001D83, 0x00001E21, 0x000024D6, 0x0000A77F, + 0x0000FF47, 0x000024A2, 0x00000124, 0x00000126, 0x0000021E, 0x0000029C, 0x00001E22, + 0x00001E24, 0x00001E26, 0x00001E28, 0x00001E2A, 0x000024BD, 0x00002C67, 0x00002C75, + 0x0000FF28, 0x00000125, 0x00000127, 0x0000021F, 
0x00000265, 0x00000266, 0x000002AE, + 0x000002AF, 0x00001E23, 0x00001E25, 0x00001E27, 0x00001E29, 0x00001E2B, 0x00001E96, + 0x000024D7, 0x00002C68, 0x00002C76, 0x0000FF48, 0x000001F6, 0x000024A3, 0x00000195, + 0x000000CC, 0x000000CD, 0x000000CE, 0x000000CF, 0x00000128, 0x0000012A, 0x0000012C, + 0x0000012E, 0x00000130, 0x00000196, 0x00000197, 0x000001CF, 0x00000208, 0x0000020A, + 0x0000026A, 0x00001D7B, 0x00001E2C, 0x00001E2E, 0x00001EC8, 0x00001ECA, 0x000024BE, + 0x0000A7FE, 0x0000FF29, 0x000000EC, 0x000000ED, 0x000000EE, 0x000000EF, 0x00000129, + 0x0000012B, 0x0000012D, 0x0000012F, 0x00000131, 0x000001D0, 0x00000209, 0x0000020B, + 0x00000268, 0x00001D09, 0x00001D62, 0x00001D7C, 0x00001D96, 0x00001E2D, 0x00001E2F, + 0x00001EC9, 0x00001ECB, 0x00002071, 0x000024D8, 0x0000FF49, 0x00000132, 0x000024A4, + 0x00000133, 0x00000134, 0x00000248, 0x00001D0A, 0x000024BF, 0x0000FF2A, 0x00000135, + 0x000001F0, 0x00000237, 0x00000249, 0x0000025F, 0x00000284, 0x0000029D, 0x000024D9, + 0x00002C7C, 0x0000FF4A, 0x000024A5, 0x00000136, 0x00000198, 0x000001E8, 0x00001D0B, + 0x00001E30, 0x00001E32, 0x00001E34, 0x000024C0, 0x00002C69, 0x0000A740, 0x0000A742, + 0x0000A744, 0x0000FF2B, 0x00000137, 0x00000199, 0x000001E9, 0x0000029E, 0x00001D84, + 0x00001E31, 0x00001E33, 0x00001E35, 0x000024DA, 0x00002C6A, 0x0000A741, 0x0000A743, + 0x0000A745, 0x0000FF4B, 0x000024A6, 0x00000139, 0x0000013B, 0x0000013D, 0x0000013F, + 0x00000141, 0x0000023D, 0x0000029F, 0x00001D0C, 0x00001E36, 0x00001E38, 0x00001E3A, + 0x00001E3C, 0x000024C1, 0x00002C60, 0x00002C62, 0x0000A746, 0x0000A748, 0x0000A780, + 0x0000FF2C, 0x0000013A, 0x0000013C, 0x0000013E, 0x00000140, 0x00000142, 0x0000019A, + 0x00000234, 0x0000026B, 0x0000026C, 0x0000026D, 0x00001D85, 0x00001E37, 0x00001E39, + 0x00001E3B, 0x00001E3D, 0x000024DB, 0x00002C61, 0x0000A747, 0x0000A749, 0x0000A781, + 0x0000FF4C, 0x000001C7, 0x00001EFA, 0x000001C8, 0x000024A7, 0x000001C9, 0x00001EFB, + 0x000002AA, 0x000002AB, 0x0000019C, 0x00001D0D, 0x00001E3E, 
0x00001E40, 0x00001E42, + 0x000024C2, 0x00002C6E, 0x0000A7FD, 0x0000A7FF, 0x0000FF2D, 0x0000026F, 0x00000270, + 0x00000271, 0x00001D6F, 0x00001D86, 0x00001E3F, 0x00001E41, 0x00001E43, 0x000024DC, + 0x0000FF4D, 0x000024A8, 0x000000D1, 0x00000143, 0x00000145, 0x00000147, 0x0000014A, + 0x0000019D, 0x000001F8, 0x00000220, 0x00000274, 0x00001D0E, 0x00001E44, 0x00001E46, + 0x00001E48, 0x00001E4A, 0x000024C3, 0x0000FF2E, 0x000000F1, 0x00000144, 0x00000146, + 0x00000148, 0x00000149, 0x0000014B, 0x0000019E, 0x000001F9, 0x00000235, 0x00000272, + 0x00000273, 0x00001D70, 0x00001D87, 0x00001E45, 0x00001E47, 0x00001E49, 0x00001E4B, + 0x0000207F, 0x000024DD, 0x0000FF4E, 0x000001CA, 0x000001CB, 0x000024A9, 0x000001CC, + 0x000000D2, 0x000000D3, 0x000000D4, 0x000000D5, 0x000000D6, 0x000000D8, 0x0000014C, + 0x0000014E, 0x00000150, 0x00000186, 0x0000019F, 0x000001A0, 0x000001D1, 0x000001EA, + 0x000001EC, 0x000001FE, 0x0000020C, 0x0000020E, 0x0000022A, 0x0000022C, 0x0000022E, + 0x00000230, 0x00001D0F, 0x00001D10, 0x00001E4C, 0x00001E4E, 0x00001E50, 0x00001E52, + 0x00001ECC, 0x00001ECE, 0x00001ED0, 0x00001ED2, 0x00001ED4, 0x00001ED6, 0x00001ED8, + 0x00001EDA, 0x00001EDC, 0x00001EDE, 0x00001EE0, 0x00001EE2, 0x000024C4, 0x0000A74A, + 0x0000A74C, 0x0000FF2F, 0x000000F2, 0x000000F3, 0x000000F4, 0x000000F5, 0x000000F6, + 0x000000F8, 0x0000014D, 0x0000014F, 0x00000151, 0x000001A1, 0x000001D2, 0x000001EB, + 0x000001ED, 0x000001FF, 0x0000020D, 0x0000020F, 0x0000022B, 0x0000022D, 0x0000022F, + 0x00000231, 0x00000254, 0x00000275, 0x00001D16, 0x00001D17, 0x00001D97, 0x00001E4D, + 0x00001E4F, 0x00001E51, 0x00001E53, 0x00001ECD, 0x00001ECF, 0x00001ED1, 0x00001ED3, + 0x00001ED5, 0x00001ED7, 0x00001ED9, 0x00001EDB, 0x00001EDD, 0x00001EDF, 0x00001EE1, + 0x00001EE3, 0x00002092, 0x000024DE, 0x00002C7A, 0x0000A74B, 0x0000A74D, 0x0000FF4F, + 0x00000152, 0x00000276, 0x0000A74E, 0x00000222, 0x00001D15, 0x000024AA, 0x00000153, + 0x00001D14, 0x0000A74F, 0x00000223, 0x000001A4, 0x00001D18, 0x00001E54, 
0x00001E56, + 0x000024C5, 0x00002C63, 0x0000A750, 0x0000A752, 0x0000A754, 0x0000FF30, 0x000001A5, + 0x00001D71, 0x00001D7D, 0x00001D88, 0x00001E55, 0x00001E57, 0x000024DF, 0x0000A751, + 0x0000A753, 0x0000A755, 0x0000A7FC, 0x0000FF50, 0x000024AB, 0x0000024A, 0x000024C6, + 0x0000A756, 0x0000A758, 0x0000FF31, 0x00000138, 0x0000024B, 0x000002A0, 0x000024E0, + 0x0000A757, 0x0000A759, 0x0000FF51, 0x000024AC, 0x00000239, 0x00000154, 0x00000156, + 0x00000158, 0x00000210, 0x00000212, 0x0000024C, 0x00000280, 0x00000281, 0x00001D19, + 0x00001D1A, 0x00001E58, 0x00001E5A, 0x00001E5C, 0x00001E5E, 0x000024C7, 0x00002C64, + 0x0000A75A, 0x0000A782, 0x0000FF32, 0x00000155, 0x00000157, 0x00000159, 0x00000211, + 0x00000213, 0x0000024D, 0x0000027C, 0x0000027D, 0x0000027E, 0x0000027F, 0x00001D63, + 0x00001D72, 0x00001D73, 0x00001D89, 0x00001E59, 0x00001E5B, 0x00001E5D, 0x00001E5F, + 0x000024E1, 0x0000A75B, 0x0000A783, 0x0000FF52, 0x000024AD, 0x0000015A, 0x0000015C, + 0x0000015E, 0x00000160, 0x00000218, 0x00001E60, 0x00001E62, 0x00001E64, 0x00001E66, + 0x00001E68, 0x000024C8, 0x0000A731, 0x0000A785, 0x0000FF33, 0x0000015B, 0x0000015D, + 0x0000015F, 0x00000161, 0x0000017F, 0x00000219, 0x0000023F, 0x00000282, 0x00001D74, + 0x00001D8A, 0x00001E61, 0x00001E63, 0x00001E65, 0x00001E67, 0x00001E69, 0x00001E9C, + 0x00001E9D, 0x000024E2, 0x0000A784, 0x0000FF53, 0x00001E9E, 0x000024AE, 0x000000DF, + 0x0000FB06, 0x00000162, 0x00000164, 0x00000166, 0x000001AC, 0x000001AE, 0x0000021A, + 0x0000023E, 0x00001D1B, 0x00001E6A, 0x00001E6C, 0x00001E6E, 0x00001E70, 0x000024C9, + 0x0000A786, 0x0000FF34, 0x00000163, 0x00000165, 0x00000167, 0x000001AB, 0x000001AD, + 0x0000021B, 0x00000236, 0x00000287, 0x00000288, 0x00001D75, 0x00001E6B, 0x00001E6D, + 0x00001E6F, 0x00001E71, 0x00001E97, 0x000024E3, 0x00002C66, 0x0000FF54, 0x000000DE, + 0x0000A766, 0x0000A728, 0x000024AF, 0x000002A8, 0x000000FE, 0x00001D7A, 0x0000A767, + 0x000002A6, 0x0000A729, 0x000000D9, 0x000000DA, 0x000000DB, 0x000000DC, 0x00000168, + 
0x0000016A, 0x0000016C, 0x0000016E, 0x00000170, 0x00000172, 0x000001AF, 0x000001D3, + 0x000001D5, 0x000001D7, 0x000001D9, 0x000001DB, 0x00000214, 0x00000216, 0x00000244, + 0x00001D1C, 0x00001D7E, 0x00001E72, 0x00001E74, 0x00001E76, 0x00001E78, 0x00001E7A, + 0x00001EE4, 0x00001EE6, 0x00001EE8, 0x00001EEA, 0x00001EEC, 0x00001EEE, 0x00001EF0, + 0x000024CA, 0x0000FF35, 0x000000F9, 0x000000FA, 0x000000FB, 0x000000FC, 0x00000169, + 0x0000016B, 0x0000016D, 0x0000016F, 0x00000171, 0x00000173, 0x000001B0, 0x000001D4, + 0x000001D6, 0x000001D8, 0x000001DA, 0x000001DC, 0x00000215, 0x00000217, 0x00000289, + 0x00001D64, 0x00001D99, 0x00001E73, 0x00001E75, 0x00001E77, 0x00001E79, 0x00001E7B, + 0x00001EE5, 0x00001EE7, 0x00001EE9, 0x00001EEB, 0x00001EED, 0x00001EEF, 0x00001EF1, + 0x000024E4, 0x0000FF55, 0x000024B0, 0x00001D6B, 0x000001B2, 0x00000245, 0x00001D20, + 0x00001E7C, 0x00001E7E, 0x00001EFC, 0x000024CB, 0x0000A75E, 0x0000A768, 0x0000FF36, + 0x0000028B, 0x0000028C, 0x00001D65, 0x00001D8C, 0x00001E7D, 0x00001E7F, 0x000024E5, + 0x00002C71, 0x00002C74, 0x0000A75F, 0x0000FF56, 0x0000A760, 0x000024B1, 0x0000A761, + 0x00000174, 0x000001F7, 0x00001D21, 0x00001E80, 0x00001E82, 0x00001E84, 0x00001E86, + 0x00001E88, 0x000024CC, 0x00002C72, 0x0000FF37, 0x00000175, 0x000001BF, 0x0000028D, + 0x00001E81, 0x00001E83, 0x00001E85, 0x00001E87, 0x00001E89, 0x00001E98, 0x000024E6, + 0x00002C73, 0x0000FF57, 0x000024B2, 0x00001E8A, 0x00001E8C, 0x000024CD, 0x0000FF38, + 0x00001D8D, 0x00001E8B, 0x00001E8D, 0x00002093, 0x000024E7, 0x0000FF58, 0x000024B3, + 0x000000DD, 0x00000176, 0x00000178, 0x000001B3, 0x00000232, 0x0000024E, 0x0000028F, + 0x00001E8E, 0x00001EF2, 0x00001EF4, 0x00001EF6, 0x00001EF8, 0x00001EFE, 0x000024CE, + 0x0000FF39, 0x000000FD, 0x000000FF, 0x00000177, 0x000001B4, 0x00000233, 0x0000024F, + 0x0000028E, 0x00001E8F, 0x00001E99, 0x00001EF3, 0x00001EF5, 0x00001EF7, 0x00001EF9, + 0x00001EFF, 0x000024E8, 0x0000FF59, 0x000024B4, 0x00000179, 0x0000017B, 0x0000017D, + 0x000001B5, 
0x0000021C, 0x00000224, 0x00001D22, 0x00001E90, 0x00001E92, 0x00001E94, + 0x000024CF, 0x00002C6B, 0x0000A762, 0x0000FF3A, 0x0000017A, 0x0000017C, 0x0000017E, + 0x000001B6, 0x0000021D, 0x00000225, 0x00000240, 0x00000290, 0x00000291, 0x00001D76, + 0x00001D8E, 0x00001E91, 0x00001E93, 0x00001E95, 0x000024E9, 0x00002C6C, 0x0000A763, + 0x0000FF5A, 0x000024B5, 0x00002070, 0x00002080, 0x000024EA, 0x000024FF, 0x0000FF10, + 0x000000B9, 0x00002081, 0x00002460, 0x000024F5, 0x00002776, 0x00002780, 0x0000278A, + 0x0000FF11, 0x00002488, 0x00002474, 0x000000B2, 0x00002082, 0x00002461, 0x000024F6, + 0x00002777, 0x00002781, 0x0000278B, 0x0000FF12, 0x00002489, 0x00002475, 0x000000B3, + 0x00002083, 0x00002462, 0x000024F7, 0x00002778, 0x00002782, 0x0000278C, 0x0000FF13, + 0x0000248A, 0x00002476, 0x00002074, 0x00002084, 0x00002463, 0x000024F8, 0x00002779, + 0x00002783, 0x0000278D, 0x0000FF14, 0x0000248B, 0x00002477, 0x00002075, 0x00002085, + 0x00002464, 0x000024F9, 0x0000277A, 0x00002784, 0x0000278E, 0x0000FF15, 0x0000248C, + 0x00002478, 0x00002076, 0x00002086, 0x00002465, 0x000024FA, 0x0000277B, 0x00002785, + 0x0000278F, 0x0000FF16, 0x0000248D, 0x00002479, 0x00002077, 0x00002087, 0x00002466, + 0x000024FB, 0x0000277C, 0x00002786, 0x00002790, 0x0000FF17, 0x0000248E, 0x0000247A, + 0x00002078, 0x00002088, 0x00002467, 0x000024FC, 0x0000277D, 0x00002787, 0x00002791, + 0x0000FF18, 0x0000248F, 0x0000247B, 0x00002079, 0x00002089, 0x00002468, 0x000024FD, + 0x0000277E, 0x00002788, 0x00002792, 0x0000FF19, 0x00002490, 0x0000247C, 0x00002469, + 0x000024FE, 0x0000277F, 0x00002789, 0x00002793, 0x00002491, 0x0000247D, 0x0000246A, + 0x000024EB, 0x00002492, 0x0000247E, 0x0000246B, 0x000024EC, 0x00002493, 0x0000247F, + 0x0000246C, 0x000024ED, 0x00002494, 0x00002480, 0x0000246D, 0x000024EE, 0x00002495, + 0x00002481, 0x0000246E, 0x000024EF, 0x00002496, 0x00002482, 0x0000246F, 0x000024F0, + 0x00002497, 0x00002483, 0x00002470, 0x000024F1, 0x00002498, 0x00002484, 0x00002471, + 0x000024F2, 0x00002499, 
0x00002485, 0x00002472, 0x000024F3, 0x0000249A, 0x00002486, + 0x00002473, 0x000024F4, 0x0000249B, 0x00002487, 0x000000AB, 0x000000BB, 0x0000201C, + 0x0000201D, 0x0000201E, 0x00002033, 0x00002036, 0x0000275D, 0x0000275E, 0x0000276E, + 0x0000276F, 0x0000FF02, 0x00002018, 0x00002019, 0x0000201A, 0x0000201B, 0x00002032, + 0x00002035, 0x00002039, 0x0000203A, 0x0000275B, 0x0000275C, 0x0000FF07, 0x00002010, + 0x00002011, 0x00002012, 0x00002013, 0x00002014, 0x0000207B, 0x0000208B, 0x0000FF0D, + 0x00002045, 0x00002772, 0x0000FF3B, 0x00002046, 0x00002773, 0x0000FF3D, 0x0000207D, + 0x0000208D, 0x00002768, 0x0000276A, 0x0000FF08, 0x00002E28, 0x0000207E, 0x0000208E, + 0x00002769, 0x0000276B, 0x0000FF09, 0x00002E29, 0x0000276C, 0x00002770, 0x0000FF1C, + 0x0000276D, 0x00002771, 0x0000FF1E, 0x00002774, 0x0000FF5B, 0x00002775, 0x0000FF5D, + 0x0000207A, 0x0000208A, 0x0000FF0B, 0x0000207C, 0x0000208C, 0x0000FF1D, 0x0000FF01, + 0x0000203C, 0x00002049, 0x0000FF03, 0x0000FF04, 0x00002052, 0x0000FF05, 0x0000FF06, + 0x0000204E, 0x0000FF0A, 0x0000FF0C, 0x0000FF0E, 0x00002044, 0x0000FF0F, 0x0000FF1A, + 0x0000204F, 0x0000FF1B, 0x0000FF1F, 0x00002047, 0x00002048, 0x0000FF20, 0x0000FF3C, + 0x00002038, 0x0000FF3E, 0x0000FF3F, 0x00002053, 0x0000FF5E}; +static const int replacements[] = {0x00000000, 0x00000001, 0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000006, + 0x00000007, 0x00000008, 0x00000009, 0x0000000A, 0x0000000B, 0x0000000C, 0x0000000D, + 0x0000000E, 0x0000000F, 0x00000010, 0x00000011, 0x00000012, 0x00000013, 0x00000014, + 0x00000015, 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x0000001A, 0x0000001B, + 0x0000001C, 0x0000001D, 0x0000001E, 0x0000001F, 0x00000020, 0x00000021, 0x00000022, + 0x00000023, 0x00000024, 0x00000025, 0x00000026, 0x00000027, 0x00000028, 0x00000029, + 0x0000002A, 0x0000002B, 0x0000002C, 0x0000002D, 0x0000002E, 0x0000002F, 0x00000030, + 0x00000031, 0x00000032, 0x00000033, 0x00000034, 0x00000035, 0x00000036, 0x00000037, + 0x00000038, 0x00000039, 
0x0000003A, 0x0000003B, 0x0000003C, 0x0000003D, 0x0000003E, + 0x0000003F, 0x00000040, 0x00000041, 0x00000042, 0x00000043, 0x00000044, 0x00000045, + 0x00000046, 0x00000047, 0x00000048, 0x00000049, 0x0000004A, 0x0000004B, 0x0000004C, + 0x0000004D, 0x0000004E, 0x0000004F, 0x00000050, 0x00000051, 0x00000052, 0x00000053, + 0x00000054, 0x00000055, 0x00000056, 0x00000057, 0x00000058, 0x00000059, 0x0000005A, + 0x0000005B, 0x0000005C, 0x0000005D, 0x0000005E, 0x0000005F, 0x00000060, 0x00000061, + 0x00000062, 0x00000063, 0x00000064, 0x00000065, 0x00000066, 0x00000067, 0x00000068, + 0x00000069, 0x0000006A, 0x0000006B, 0x0000006C, 0x0000006D, 0x0000006E, 0x0000006F, + 0x00000070, 0x00000071, 0x00000072, 0x00000073, 0x00000074, 0x00000075, 0x00000076, + 0x00000077, 0x00000078, 0x00000079, 0x0000007A, 0x0000007B, 0x0000007C, 0x0000007D, + 0x0000007E, 0x004C4544, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, + 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, + 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, + 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, + 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, 0x00000041, + 0x00000041, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00004141, 0x00004541, 0x00004541, 0x00004541, 0x00004541, 0x00004F41, 0x00005541, + 0x00005641, 0x00005641, 0x00005941, 0x00296128, 0x00006161, 0x00006561, 0x00006561, + 0x00006561, 0x00006561, 0x00006F61, 
0x00007561, 0x00007661, 0x00007661, 0x00007961, + 0x00000042, 0x00000042, 0x00000042, 0x00000042, 0x00000042, 0x00000042, 0x00000042, + 0x00000042, 0x00000042, 0x00000042, 0x00000062, 0x00000062, 0x00000062, 0x00000062, + 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00296228, + 0x00000043, 0x00000043, 0x00000043, 0x00000043, 0x00000043, 0x00000043, 0x00000043, + 0x00000043, 0x00000043, 0x00000043, 0x00000043, 0x00000043, 0x00000063, 0x00000063, + 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, + 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00296328, 0x00000044, + 0x00000044, 0x00000044, 0x00000044, 0x00000044, 0x00000044, 0x00000044, 0x00000044, + 0x00000044, 0x00000044, 0x00000044, 0x00000044, 0x00000044, 0x00000044, 0x00000044, + 0x00000044, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, + 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, + 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00005A44, 0x00005A44, + 0x00007A44, 0x00007A44, 0x00296428, 0x00006264, 0x00007A64, 0x00007A64, 0x00007A64, + 0x00007A64, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, + 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, + 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, + 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, + 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000045, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 
0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00296528, 0x00000046, 0x00000046, + 0x00000046, 0x00000046, 0x00000046, 0x00000046, 0x00000046, 0x00000066, 0x00000066, + 0x00000066, 0x00000066, 0x00000066, 0x00000066, 0x00000066, 0x00000066, 0x00296628, + 0x00006666, 0x00696666, 0x006C6666, 0x00006966, 0x00006C66, 0x00000047, 0x00000047, + 0x00000047, 0x00000047, 0x00000047, 0x00000047, 0x00000047, 0x00000047, 0x00000047, + 0x00000047, 0x00000047, 0x00000047, 0x00000047, 0x00000047, 0x00000047, 0x00000047, + 0x00000047, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, + 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, + 0x00000067, 0x00296728, 0x00000048, 0x00000048, 0x00000048, 0x00000048, 0x00000048, + 0x00000048, 0x00000048, 0x00000048, 0x00000048, 0x00000048, 0x00000048, 0x00000048, + 0x00000048, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, + 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, + 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00005648, 0x00296828, 0x00007668, + 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, + 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, + 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, 0x00000049, + 0x00000049, 0x00000049, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00004A49, 0x00296928, + 0x00006A69, 0x0000004A, 0x0000004A, 0x0000004A, 0x0000004A, 0x0000004A, 0x0000006A, + 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, + 0x0000006A, 0x0000006A, 0x00296A28, 0x0000004B, 0x0000004B, 
0x0000004B, 0x0000004B, + 0x0000004B, 0x0000004B, 0x0000004B, 0x0000004B, 0x0000004B, 0x0000004B, 0x0000004B, + 0x0000004B, 0x0000004B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, + 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, + 0x0000006B, 0x0000006B, 0x00296B28, 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, + 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, + 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, 0x0000004C, + 0x0000004C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x00004A4C, 0x00004C4C, 0x00006A4C, 0x00296C28, 0x00006A6C, 0x00006C6C, + 0x0000736C, 0x00007A6C, 0x0000004D, 0x0000004D, 0x0000004D, 0x0000004D, 0x0000004D, + 0x0000004D, 0x0000004D, 0x0000004D, 0x0000004D, 0x0000004D, 0x0000006D, 0x0000006D, + 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, + 0x0000006D, 0x00296D28, 0x0000004E, 0x0000004E, 0x0000004E, 0x0000004E, 0x0000004E, + 0x0000004E, 0x0000004E, 0x0000004E, 0x0000004E, 0x0000004E, 0x0000004E, 0x0000004E, + 0x0000004E, 0x0000004E, 0x0000004E, 0x0000004E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x00004A4E, 0x00006A4E, 0x00296E28, 0x00006A6E, + 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, + 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, + 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, + 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 
0x0000004F, + 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, + 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, 0x0000004F, + 0x0000004F, 0x0000004F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000454F, 0x0000454F, 0x00004F4F, 0x0000554F, 0x0000554F, 0x00296F28, 0x0000656F, + 0x0000656F, 0x00006F6F, 0x0000756F, 0x00000050, 0x00000050, 0x00000050, 0x00000050, + 0x00000050, 0x00000050, 0x00000050, 0x00000050, 0x00000050, 0x00000050, 0x00000070, + 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, + 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00297028, 0x00000051, 0x00000051, + 0x00000051, 0x00000051, 0x00000051, 0x00000071, 0x00000071, 0x00000071, 0x00000071, + 0x00000071, 0x00000071, 0x00000071, 0x00297128, 0x00007071, 0x00000052, 0x00000052, + 0x00000052, 0x00000052, 0x00000052, 0x00000052, 0x00000052, 0x00000052, 0x00000052, + 0x00000052, 0x00000052, 0x00000052, 0x00000052, 0x00000052, 0x00000052, 0x00000052, + 0x00000052, 0x00000052, 0x00000052, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00297228, 0x00000053, 0x00000053, + 0x00000053, 0x00000053, 0x00000053, 0x00000053, 0x00000053, 0x00000053, 0x00000053, + 
0x00000053, 0x00000053, 0x00000053, 0x00000053, 0x00000053, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00005353, 0x00297328, 0x00007373, + 0x00007473, 0x00000054, 0x00000054, 0x00000054, 0x00000054, 0x00000054, 0x00000054, + 0x00000054, 0x00000054, 0x00000054, 0x00000054, 0x00000054, 0x00000054, 0x00000054, + 0x00000054, 0x00000054, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, + 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, + 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00004854, + 0x00004854, 0x00005A54, 0x00297428, 0x00006374, 0x00006874, 0x00006874, 0x00006874, + 0x00007374, 0x00007A74, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, + 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, + 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, + 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, + 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, 0x00000055, + 0x00000055, 0x00000055, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00297528, 0x00006575, 0x00000056, 0x00000056, 0x00000056, + 0x00000056, 0x00000056, 0x00000056, 0x00000056, 0x00000056, 0x00000056, 0x00000056, + 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, + 0x00000076, 
0x00000076, 0x00000076, 0x00000076, 0x00005956, 0x00297628, 0x00007976, + 0x00000057, 0x00000057, 0x00000057, 0x00000057, 0x00000057, 0x00000057, 0x00000057, + 0x00000057, 0x00000057, 0x00000057, 0x00000057, 0x00000077, 0x00000077, 0x00000077, + 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, + 0x00000077, 0x00000077, 0x00297728, 0x00000058, 0x00000058, 0x00000058, 0x00000058, + 0x00000078, 0x00000078, 0x00000078, 0x00000078, 0x00000078, 0x00000078, 0x00297828, + 0x00000059, 0x00000059, 0x00000059, 0x00000059, 0x00000059, 0x00000059, 0x00000059, + 0x00000059, 0x00000059, 0x00000059, 0x00000059, 0x00000059, 0x00000059, 0x00000059, + 0x00000059, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, + 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, + 0x00000079, 0x00000079, 0x00000079, 0x00297928, 0x0000005A, 0x0000005A, 0x0000005A, + 0x0000005A, 0x0000005A, 0x0000005A, 0x0000005A, 0x0000005A, 0x0000005A, 0x0000005A, + 0x0000005A, 0x0000005A, 0x0000005A, 0x0000005A, 0x0000007A, 0x0000007A, 0x0000007A, + 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, + 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, + 0x0000007A, 0x00297A28, 0x00000030, 0x00000030, 0x00000030, 0x00000030, 0x00000030, + 0x00000031, 0x00000031, 0x00000031, 0x00000031, 0x00000031, 0x00000031, 0x00000031, + 0x00000031, 0x00002E31, 0x00293128, 0x00000032, 0x00000032, 0x00000032, 0x00000032, + 0x00000032, 0x00000032, 0x00000032, 0x00000032, 0x00002E32, 0x00293228, 0x00000033, + 0x00000033, 0x00000033, 0x00000033, 0x00000033, 0x00000033, 0x00000033, 0x00000033, + 0x00002E33, 0x00293328, 0x00000034, 0x00000034, 0x00000034, 0x00000034, 0x00000034, + 0x00000034, 0x00000034, 0x00000034, 0x00002E34, 0x00293428, 0x00000035, 0x00000035, + 0x00000035, 0x00000035, 0x00000035, 0x00000035, 0x00000035, 0x00000035, 0x00002E35, + 0x00293528, 0x00000036, 
0x00000036, 0x00000036, 0x00000036, 0x00000036, 0x00000036, + 0x00000036, 0x00000036, 0x00002E36, 0x00293628, 0x00000037, 0x00000037, 0x00000037, + 0x00000037, 0x00000037, 0x00000037, 0x00000037, 0x00000037, 0x00002E37, 0x00293728, + 0x00000038, 0x00000038, 0x00000038, 0x00000038, 0x00000038, 0x00000038, 0x00000038, + 0x00000038, 0x00002E38, 0x00293828, 0x00000039, 0x00000039, 0x00000039, 0x00000039, + 0x00000039, 0x00000039, 0x00000039, 0x00000039, 0x00002E39, 0x00293928, 0x00003031, + 0x00003031, 0x00003031, 0x00003031, 0x00003031, 0x002E3031, 0x29303128, 0x00003131, + 0x00003131, 0x002E3131, 0x29313128, 0x00003231, 0x00003231, 0x002E3231, 0x29323128, + 0x00003331, 0x00003331, 0x002E3331, 0x29333128, 0x00003431, 0x00003431, 0x002E3431, + 0x29343128, 0x00003531, 0x00003531, 0x002E3531, 0x29353128, 0x00003631, 0x00003631, + 0x002E3631, 0x29363128, 0x00003731, 0x00003731, 0x002E3731, 0x29373128, 0x00003831, + 0x00003831, 0x002E3831, 0x29383128, 0x00003931, 0x00003931, 0x002E3931, 0x29393128, + 0x00003032, 0x00003032, 0x002E3032, 0x29303228, 0x00000022, 0x00000022, 0x00000022, + 0x00000022, 0x00000022, 0x00000022, 0x00000022, 0x00000022, 0x00000022, 0x00000022, + 0x00000022, 0x00000022, 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x00000027, + 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x0000002D, + 0x0000002D, 0x0000002D, 0x0000002D, 0x0000002D, 0x0000002D, 0x0000002D, 0x0000002D, + 0x0000005B, 0x0000005B, 0x0000005B, 0x0000005D, 0x0000005D, 0x0000005D, 0x00000028, + 0x00000028, 0x00000028, 0x00000028, 0x00000028, 0x00002828, 0x00000029, 0x00000029, + 0x00000029, 0x00000029, 0x00000029, 0x00002929, 0x0000003C, 0x0000003C, 0x0000003C, + 0x0000003E, 0x0000003E, 0x0000003E, 0x0000007B, 0x0000007B, 0x0000007D, 0x0000007D, + 0x0000002B, 0x0000002B, 0x0000002B, 0x0000003D, 0x0000003D, 0x0000003D, 0x00000021, + 0x00002121, 0x00003F21, 0x00000023, 0x00000024, 0x00000025, 0x00000025, 0x00000026, + 0x0000002A, 0x0000002A, 0x0000002C, 
0x0000002E, 0x0000002F, 0x0000002F, 0x0000003A, + 0x0000003B, 0x0000003B, 0x0000003F, 0x00003F3F, 0x0000213F, 0x00000040, 0x0000005C, + 0x0000005E, 0x0000005E, 0x0000005F, 0x0000007E, 0x0000007E}; +static const int replacements_lw[] = {0x00000000, 0x00000001, 0x00000002, 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007, 0x00000008, 0x00000009, 0x0000000A, 0x0000000B, + 0x0000000C, 0x0000000D, 0x0000000E, 0x0000000F, 0x00000010, 0x00000011, + 0x00000012, 0x00000013, 0x00000014, 0x00000015, 0x00000016, 0x00000017, + 0x00000018, 0x00000019, 0x0000001A, 0x0000001B, 0x0000001C, 0x0000001D, + 0x0000001E, 0x0000001F, 0x00000020, 0x00000021, 0x00000022, 0x00000023, + 0x00000024, 0x00000025, 0x00000026, 0x00000027, 0x00000028, 0x00000029, + 0x0000002A, 0x0000002B, 0x0000002C, 0x0000002D, 0x0000002E, 0x0000002F, + 0x00000030, 0x00000031, 0x00000032, 0x00000033, 0x00000034, 0x00000035, + 0x00000036, 0x00000037, 0x00000038, 0x00000039, 0x0000003A, 0x0000003B, + 0x0000003C, 0x0000003D, 0x0000003E, 0x0000003F, 0x00000040, 0x00000061, + 0x00000062, 0x00000063, 0x00000064, 0x00000065, 0x00000066, 0x00000067, + 0x00000068, 0x00000069, 0x0000006A, 0x0000006B, 0x0000006C, 0x0000006D, + 0x0000006E, 0x0000006F, 0x00000070, 0x00000071, 0x00000072, 0x00000073, + 0x00000074, 0x00000075, 0x00000076, 0x00000077, 0x00000078, 0x00000079, + 0x0000007A, 0x0000005B, 0x0000005C, 0x0000005D, 0x0000005E, 0x0000005F, + 0x00000060, 0x00000061, 0x00000062, 0x00000063, 0x00000064, 0x00000065, + 0x00000066, 0x00000067, 0x00000068, 0x00000069, 0x0000006A, 0x0000006B, + 0x0000006C, 0x0000006D, 0x0000006E, 0x0000006F, 0x00000070, 0x00000071, + 0x00000072, 0x00000073, 0x00000074, 0x00000075, 0x00000076, 0x00000077, + 0x00000078, 0x00000079, 0x0000007A, 0x0000007B, 0x0000007C, 0x0000007D, + 0x0000007E, 0x006C6564, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 
0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, + 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00000061, 0x00006161, + 0x00006561, 0x00006561, 0x00006561, 0x00006561, 0x00006F61, 0x00007561, + 0x00007661, 0x00007661, 0x00007961, 0x00296128, 0x00006161, 0x00006561, + 0x00006561, 0x00006561, 0x00006561, 0x00006F61, 0x00007561, 0x00007661, + 0x00007661, 0x00007961, 0x00000062, 0x00000062, 0x00000062, 0x00000062, + 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00000062, + 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00000062, + 0x00000062, 0x00000062, 0x00000062, 0x00000062, 0x00296228, 0x00000063, + 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, + 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, + 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, + 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, 0x00000063, + 0x00000063, 0x00296328, 0x00000064, 0x00000064, 0x00000064, 0x00000064, + 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, + 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, + 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, + 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, + 0x00000064, 0x00000064, 0x00000064, 0x00000064, 0x00000064, 
0x00000064, + 0x00007A64, 0x00007A64, 0x00007A64, 0x00007A64, 0x00296428, 0x00006264, + 0x00007A64, 0x00007A64, 0x00007A64, 0x00007A64, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, + 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00000065, 0x00296528, + 0x00000066, 0x00000066, 0x00000066, 0x00000066, 0x00000066, 0x00000066, + 0x00000066, 0x00000066, 0x00000066, 0x00000066, 0x00000066, 0x00000066, + 0x00000066, 0x00000066, 0x00000066, 0x00296628, 0x00006666, 0x00696666, + 0x006C6666, 0x00006966, 0x00006C66, 0x00000067, 0x00000067, 0x00000067, + 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, + 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, + 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, + 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00000067, + 0x00000067, 0x00000067, 0x00000067, 0x00000067, 0x00296728, 0x00000068, + 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, + 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, + 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, + 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 
0x00000068, + 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00000068, 0x00007668, + 0x00296828, 0x00007668, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, 0x00000069, + 0x00000069, 0x00006A69, 0x00296928, 0x00006A69, 0x0000006A, 0x0000006A, + 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, + 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, 0x0000006A, + 0x0000006A, 0x00296A28, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, + 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, + 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, + 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, + 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x0000006B, 0x00296B28, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, + 0x0000006C, 0x0000006C, 0x0000006C, 0x0000006C, 0x00006A6C, 0x00006C6C, + 0x00006A6C, 0x00296C28, 0x00006A6C, 0x00006C6C, 0x0000736C, 0x00007A6C, + 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, + 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 
0x0000006D, + 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, 0x0000006D, + 0x0000006D, 0x0000006D, 0x00296D28, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, 0x0000006E, + 0x0000006E, 0x0000006E, 0x0000006E, 0x00006A6E, 0x00006A6E, 0x00296E28, + 0x00006A6E, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, 0x0000006F, + 0x0000006F, 0x0000006F, 0x0000656F, 0x0000656F, 0x00006F6F, 0x0000756F, + 0x0000756F, 0x00296F28, 0x0000656F, 0x0000656F, 0x00006F6F, 0x0000756F, + 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, + 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, 
0x00000070, + 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00000070, + 0x00000070, 0x00000070, 0x00000070, 0x00000070, 0x00297028, 0x00000071, + 0x00000071, 0x00000071, 0x00000071, 0x00000071, 0x00000071, 0x00000071, + 0x00000071, 0x00000071, 0x00000071, 0x00000071, 0x00000071, 0x00297128, + 0x00007071, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, 0x00000072, + 0x00297228, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, + 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00000073, 0x00007373, + 0x00297328, 0x00007373, 0x00007473, 0x00000074, 0x00000074, 0x00000074, + 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, + 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, + 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, + 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, + 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, 0x00000074, + 0x00006874, 0x00006874, 0x00007A74, 0x00297428, 0x00006374, 0x00006874, + 0x00006874, 0x00006874, 0x00007374, 0x00007A74, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 
0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, 0x00000075, + 0x00000075, 0x00000075, 0x00297528, 0x00006575, 0x00000076, 0x00000076, + 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, + 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, + 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, 0x00000076, + 0x00000076, 0x00007976, 0x00297628, 0x00007976, 0x00000077, 0x00000077, + 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, + 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, + 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, 0x00000077, + 0x00000077, 0x00000077, 0x00000077, 0x00297728, 0x00000078, 0x00000078, + 0x00000078, 0x00000078, 0x00000078, 0x00000078, 0x00000078, 0x00000078, + 0x00000078, 0x00000078, 0x00297828, 0x00000079, 0x00000079, 0x00000079, + 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, + 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, + 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, + 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00000079, + 0x00000079, 0x00000079, 0x00000079, 0x00000079, 0x00297928, 0x0000007A, + 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, + 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 
0x0000007A, + 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, + 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, + 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, 0x0000007A, + 0x0000007A, 0x00297A28, 0x00000030, 0x00000030, 0x00000030, 0x00000030, + 0x00000030, 0x00000031, 0x00000031, 0x00000031, 0x00000031, 0x00000031, + 0x00000031, 0x00000031, 0x00000031, 0x00002E31, 0x00293128, 0x00000032, + 0x00000032, 0x00000032, 0x00000032, 0x00000032, 0x00000032, 0x00000032, + 0x00000032, 0x00002E32, 0x00293228, 0x00000033, 0x00000033, 0x00000033, + 0x00000033, 0x00000033, 0x00000033, 0x00000033, 0x00000033, 0x00002E33, + 0x00293328, 0x00000034, 0x00000034, 0x00000034, 0x00000034, 0x00000034, + 0x00000034, 0x00000034, 0x00000034, 0x00002E34, 0x00293428, 0x00000035, + 0x00000035, 0x00000035, 0x00000035, 0x00000035, 0x00000035, 0x00000035, + 0x00000035, 0x00002E35, 0x00293528, 0x00000036, 0x00000036, 0x00000036, + 0x00000036, 0x00000036, 0x00000036, 0x00000036, 0x00000036, 0x00002E36, + 0x00293628, 0x00000037, 0x00000037, 0x00000037, 0x00000037, 0x00000037, + 0x00000037, 0x00000037, 0x00000037, 0x00002E37, 0x00293728, 0x00000038, + 0x00000038, 0x00000038, 0x00000038, 0x00000038, 0x00000038, 0x00000038, + 0x00000038, 0x00002E38, 0x00293828, 0x00000039, 0x00000039, 0x00000039, + 0x00000039, 0x00000039, 0x00000039, 0x00000039, 0x00000039, 0x00002E39, + 0x00293928, 0x00003031, 0x00003031, 0x00003031, 0x00003031, 0x00003031, + 0x002E3031, 0x29303128, 0x00003131, 0x00003131, 0x002E3131, 0x29313128, + 0x00003231, 0x00003231, 0x002E3231, 0x29323128, 0x00003331, 0x00003331, + 0x002E3331, 0x29333128, 0x00003431, 0x00003431, 0x002E3431, 0x29343128, + 0x00003531, 0x00003531, 0x002E3531, 0x29353128, 0x00003631, 0x00003631, + 0x002E3631, 0x29363128, 0x00003731, 0x00003731, 0x002E3731, 0x29373128, + 0x00003831, 0x00003831, 0x002E3831, 0x29383128, 0x00003931, 0x00003931, + 0x002E3931, 0x29393128, 0x00003032, 0x00003032, 0x002E3032, 
0x29303228, + 0x00000022, 0x00000022, 0x00000022, 0x00000022, 0x00000022, 0x00000022, + 0x00000022, 0x00000022, 0x00000022, 0x00000022, 0x00000022, 0x00000022, + 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x00000027, + 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x00000027, 0x0000002D, + 0x0000002D, 0x0000002D, 0x0000002D, 0x0000002D, 0x0000002D, 0x0000002D, + 0x0000002D, 0x0000005B, 0x0000005B, 0x0000005B, 0x0000005D, 0x0000005D, + 0x0000005D, 0x00000028, 0x00000028, 0x00000028, 0x00000028, 0x00000028, + 0x00002828, 0x00000029, 0x00000029, 0x00000029, 0x00000029, 0x00000029, + 0x00002929, 0x0000003C, 0x0000003C, 0x0000003C, 0x0000003E, 0x0000003E, + 0x0000003E, 0x0000007B, 0x0000007B, 0x0000007D, 0x0000007D, 0x0000002B, + 0x0000002B, 0x0000002B, 0x0000003D, 0x0000003D, 0x0000003D, 0x00000021, + 0x00002121, 0x00003F21, 0x00000023, 0x00000024, 0x00000025, 0x00000025, + 0x00000026, 0x0000002A, 0x0000002A, 0x0000002C, 0x0000002E, 0x0000002F, + 0x0000002F, 0x0000003A, 0x0000003B, 0x0000003B, 0x0000003F, 0x00003F3F, + 0x0000213F, 0x00000040, 0x0000005C, 0x0000005E, 0x0000005E, 0x0000005F, + 0x0000007E, 0x0000007E}; + + +static int translate_table[0x10FFFF]; +static int translate_table_lw[0x10FFFF]; + + +void _PG_init() { + for (int i = 0; i < 0x10FFFF; i++) { + + int found = 0; + for (int j = 0; j < (sizeof(offsets) / sizeof(offsets[0])); j++) { + if (offsets[j] == i) { + translate_table[i] = replacements[j]; + translate_table_lw[i] = replacements_lw[j]; + found = 1; + break; + } + } + if (!found) { + int c = i; + + char *cur = (char *) &translate_table[i]; + + if (((utf8_int32_t) 0xfffff800 & c) == 0) { + *(cur++) = 0xc0 | (char) (c >> 6); + *(cur) = 0x80 | (char) (c & 0x3f); + } else if (((utf8_int32_t) 0xffff0000 & c) == 0) { + *(cur++) = 0xe0 | (char) (c >> 12); + *(cur++) = 0x80 | (char) ((c >> 6) & 0x3f); + *(cur) = 0x80 | (char) (c & 0x3f); + } else { + *(cur++) = 0xf0 | (char) (c >> 18); + *(cur++) = 0x80 | (char) ((c >> 6) & 0x3f); + 
*(cur++) = 0x80 | (char) ((c >> 12) & 0x3f); + *(cur) = 0x80 | (char) (c & 0x3f); + } + + cur = (char *) &translate_table_lw[i]; + + c = utf8lwrcodepoint(c); + if (((utf8_int32_t) 0xfffff800 & c) == 0) { + *(cur++) = 0xc0 | (char) (c >> 6); + *(cur) = 0x80 | (char) (c & 0x3f); + } else if (((utf8_int32_t) 0xffff0000 & c) == 0) { + *(cur++) = 0xe0 | (char) (c >> 12); + *(cur++) = 0x80 | (char) ((c >> 6) & 0x3f); + *(cur) = 0x80 | (char) (c & 0x3f); + } else { + *(cur++) = 0xf0 | (char) (c >> 18); + *(cur++) = 0x80 | (char) ((c >> 6) & 0x3f); + *(cur++) = 0x80 | (char) ((c >> 12) & 0x3f); + *(cur) = 0x80 | (char) (c & 0x3f); + } + } + } +} + +PG_FUNCTION_INFO_V1(asciifold); + +Datum asciifold(PG_FUNCTION_ARGS) { + const text *text_input = PG_GETARG_TEXT_P(0); + const char *str_input = VARDATA(text_input); + + size_t str_len = VARSIZE(text_input) - VARHDRSZ; + + int c; + const char *ptr = str_input; + const char *end = ptr + str_len; + + text *str_output = palloc(str_len * 2); + + char *cur = str_output->vl_dat; + + do { + ptr = (char *) utf8codepoint(ptr, &c); + + c = translate_table[c]; + + *cur++ = ((char *) &c)[0]; + if (((char *) &c)[1]) { + *cur++ = ((char *) &c)[1]; + if (((char *) &c)[2]) { + *cur++ = ((char *) &c)[2]; + if (((char *) &c)[3]) { *cur++ = ((char *) &c)[3]; } + } + } + + } while (ptr < end); + + SET_VARSIZE(str_output, VARHDRSZ + (cur - str_output->vl_dat)); + + PG_RETURN_TEXT_P(str_output); +} + +PG_FUNCTION_INFO_V1(asciifold_lower); + +Datum asciifold_lower(PG_FUNCTION_ARGS) { + const text *text_input = PG_GETARG_TEXT_P(0); + const char *str_input = VARDATA(text_input); + + size_t str_len = VARSIZE(text_input) - VARHDRSZ; + + int c; + const char *ptr = str_input; + const char *end = ptr + str_len; + + text *str_output = palloc(str_len * 2); + + char *cur = str_output->vl_dat; + + do { + ptr = (char *) utf8codepoint(ptr, &c); + + c = translate_table_lw[c]; + + *cur++ = ((char *) &c)[0]; + if (((char *) &c)[1]) { + *cur++ = ((char *) &c)[1]; + if 
(((char *) &c)[2]) { + *cur++ = ((char *) &c)[2]; + if (((char *) &c)[3]) { *cur++ = ((char *) &c)[3]; } + } + } + + } while (ptr < end); + + SET_VARSIZE(str_output, VARHDRSZ + (cur - str_output->vl_dat)); + + PG_RETURN_TEXT_P(str_output); +} diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..0a6fefd --- /dev/null +++ b/build.sh @@ -0,0 +1,12 @@ +LIB_PATH="'/usr/lib/libasciifolding.so'" + +POSTGRES_DB="postgres" +POSTGRES_USER="postgres" +cmake . && make && mv ./libasciifolding.so /usr/lib/ + +psql -U $POSTGRES_USER $POSTGRES_DB <'), + (0x3f, '?'), + (0x40, '@'), + (0x41, 'A'), + (0x42, 'B'), + (0x43, 'C'), + (0x44, 'D'), + (0x45, 'E'), + (0x46, 'F'), + (0x47, 'G'), + (0x48, 'H'), + (0x49, 'I'), + (0x4a, 'J'), + (0x4b, 'K'), + (0x4c, 'L'), + (0x4d, 'M'), + (0x4e, 'N'), + (0x4f, 'O'), + (0x50, 'P'), + (0x51, 'Q'), + (0x52, 'R'), + (0x53, 'S'), + (0x54, 'T'), + (0x55, 'U'), + (0x56, 'V'), + (0x57, 'W'), + (0x58, 'X'), + (0x59, 'Y'), + (0x5a, 'Z'), + (0x5b, '['), + (0x5c, '\\'), + (0x5d, ']'), + (0x5e, '^'), + (0x5f, '_'), + (0x60, '`'), + (0x61, 'a'), + (0x62, 'b'), + (0x63, 'c'), + (0x64, 'd'), + (0x65, 'e'), + (0x66, 'f'), + (0x67, 'g'), + (0x68, 'h'), + (0x69, 'i'), + (0x6a, 'j'), + (0x6b, 'k'), + (0x6c, 'l'), + (0x6d, 'm'), + (0x6e, 'n'), + (0x6f, 'o'), + (0x70, 'p'), + (0x71, 'q'), + (0x72, 'r'), + (0x73, 's'), + (0x74, 't'), + (0x75, 'u'), + (0x76, 'v'), + (0x77, 'w'), + (0x78, 'x'), + (0x79, 'y'), + (0x7a, 'z'), + (0x7b, '{'), + (0x7c, '|'), + (0x7d, '}'), + (0x7e, '~'), + (0x7f, 'DEL'), + # 00000000000000000000000 + (0xC0, 'A'), + (0xC1, 'A'), + (0xC2, 'A'), + (0xC3, 'A'), + (0xC4, 'A'), + (0xC5, 'A'), + (0x100, 'A'), + (0x102, 'A'), + (0x104, 'A'), + (0x18F, 'A'), + (0x1CD, 'A'), + (0x1DE, 'A'), + (0x1E0, 'A'), + (0x1FA, 'A'), + (0x200, 'A'), + (0x202, 'A'), + (0x226, 'A'), + (0x23A, 'A'), + (0x1D00, 'A'), + (0x1E00, 'A'), + (0x1EA0, 'A'), + (0x1EA2, 'A'), + (0x1EA4, 'A'), + (0x1EA6, 'A'), + (0x1EA8, 'A'), + (0x1EAA, 'A'), + (0x1EAC, 'A'), 
+ (0x1EAE, 'A'), + (0x1EB0, 'A'), + (0x1EB2, 'A'), + (0x1EB4, 'A'), + (0x1EB6, 'A'), + (0x24B6, 'A'), + (0xFF21, 'A'), + (0xE0, 'a'), + (0xE1, 'a'), + (0xE2, 'a'), + (0xE3, 'a'), + (0xE4, 'a'), + (0xE5, 'a'), + (0x101, 'a'), + (0x103, 'a'), + (0x105, 'a'), + (0x1CE, 'a'), + (0x1DF, 'a'), + (0x1E1, 'a'), + (0x1FB, 'a'), + (0x201, 'a'), + (0x203, 'a'), + (0x227, 'a'), + (0x250, 'a'), + (0x259, 'a'), + (0x25A, 'a'), + (0x1D8F, 'a'), + (0x1D95, 'a'), + (0x1E01, 'a'), + (0x1E9A, 'a'), + (0x1EA1, 'a'), + (0x1EA3, 'a'), + (0x1EA5, 'a'), + (0x1EA7, 'a'), + (0x1EA9, 'a'), + (0x1EAB, 'a'), + (0x1EAD, 'a'), + (0x1EAF, 'a'), + (0x1EB1, 'a'), + (0x1EB3, 'a'), + (0x1EB5, 'a'), + (0x1EB7, 'a'), + (0x2090, 'a'), + (0x2094, 'a'), + (0x24D0, 'a'), + (0x2C65, 'a'), + (0x2C6F, 'a'), + (0xFF41, 'a'), + (0xA732, 'AA'), + (0xC6, 'AE'), + (0x1E2, 'AE'), + (0x1FC, 'AE'), + (0x1D01, 'AE'), + (0xA734, 'AO'), + (0xA736, 'AU'), + (0xA738, 'AV'), + (0xA73A, 'AV'), + (0xA73C, 'AY'), + (0x249C, '(a)'), + (0xA733, 'aa'), + (0xE6, 'ae'), + (0x1E3, 'ae'), + (0x1FD, 'ae'), + (0x1D02, 'ae'), + (0xA735, 'ao'), + (0xA737, 'au'), + (0xA739, 'av'), + (0xA73B, 'av'), + (0xA73D, 'ay'), + (0x181, 'B'), + (0x182, 'B'), + (0x243, 'B'), + (0x299, 'B'), + (0x1D03, 'B'), + (0x1E02, 'B'), + (0x1E04, 'B'), + (0x1E06, 'B'), + (0x24B7, 'B'), + (0xFF22, 'B'), + (0x180, 'b'), + (0x183, 'b'), + (0x253, 'b'), + (0x1D6C, 'b'), + (0x1D80, 'b'), + (0x1E03, 'b'), + (0x1E05, 'b'), + (0x1E07, 'b'), + (0x24D1, 'b'), + (0xFF42, 'b'), + (0x249D, '(b)'), + (0xC7, 'C'), + (0x106, 'C'), + (0x108, 'C'), + (0x10A, 'C'), + (0x10C, 'C'), + (0x187, 'C'), + (0x23B, 'C'), + (0x297, 'C'), + (0x1D04, 'C'), + (0x1E08, 'C'), + (0x24B8, 'C'), + (0xFF23, 'C'), + (0xE7, 'c'), + (0x107, 'c'), + (0x109, 'c'), + (0x10B, 'c'), + (0x10D, 'c'), + (0x188, 'c'), + (0x23C, 'c'), + (0x255, 'c'), + (0x1E09, 'c'), + (0x2184, 'c'), + (0x24D2, 'c'), + (0xA73E, 'c'), + (0xA73F, 'c'), + (0xFF43, 'c'), + (0x249E, '(c)'), + (0xD0, 'D'), + (0x10E, 'D'), + (0x110, 
'D'), + (0x189, 'D'), + (0x18A, 'D'), + (0x18B, 'D'), + (0x1D05, 'D'), + (0x1D06, 'D'), + (0x1E0A, 'D'), + (0x1E0C, 'D'), + (0x1E0E, 'D'), + (0x1E10, 'D'), + (0x1E12, 'D'), + (0x24B9, 'D'), + (0xA779, 'D'), + (0xFF24, 'D'), + (0xF0, 'd'), + (0x10F, 'd'), + (0x111, 'd'), + (0x18C, 'd'), + (0x221, 'd'), + (0x256, 'd'), + (0x257, 'd'), + (0x1D6D, 'd'), + (0x1D81, 'd'), + (0x1D91, 'd'), + (0x1E0B, 'd'), + (0x1E0D, 'd'), + (0x1E0F, 'd'), + (0x1E11, 'd'), + (0x1E13, 'd'), + (0x24D3, 'd'), + (0xA77A, 'd'), + (0xFF44, 'd'), + (0x1C4, 'DZ'), + (0x1F1, 'DZ'), + (0x1C5, 'Dz'), + (0x1F2, 'Dz'), + (0x249F, '(d)'), + (0x238, 'db'), + (0x1C6, 'dz'), + (0x1F3, 'dz'), + (0x2A3, 'dz'), + (0x2A5, 'dz'), + (0xC8, 'E'), + (0xC9, 'E'), + (0xCA, 'E'), + (0xCB, 'E'), + (0x112, 'E'), + (0x114, 'E'), + (0x116, 'E'), + (0x118, 'E'), + (0x11A, 'E'), + (0x18E, 'E'), + (0x190, 'E'), + (0x204, 'E'), + (0x206, 'E'), + (0x228, 'E'), + (0x246, 'E'), + (0x1D07, 'E'), + (0x1E14, 'E'), + (0x1E16, 'E'), + (0x1E18, 'E'), + (0x1E1A, 'E'), + (0x1E1C, 'E'), + (0x1EB8, 'E'), + (0x1EBA, 'E'), + (0x1EBC, 'E'), + (0x1EBE, 'E'), + (0x1EC0, 'E'), + (0x1EC2, 'E'), + (0x1EC4, 'E'), + (0x1EC6, 'E'), + (0x24BA, 'E'), + (0x2C7B, 'E'), + (0xFF25, 'E'), + (0xE8, 'e'), + (0xE9, 'e'), + (0xEA, 'e'), + (0xEB, 'e'), + (0x113, 'e'), + (0x115, 'e'), + (0x117, 'e'), + (0x119, 'e'), + (0x11B, 'e'), + (0x1DD, 'e'), + (0x205, 'e'), + (0x207, 'e'), + (0x229, 'e'), + (0x247, 'e'), + (0x258, 'e'), + (0x25B, 'e'), + (0x25C, 'e'), + (0x25D, 'e'), + (0x25E, 'e'), + (0x29A, 'e'), + (0x1D08, 'e'), + (0x1D92, 'e'), + (0x1D93, 'e'), + (0x1D94, 'e'), + (0x1E15, 'e'), + (0x1E17, 'e'), + (0x1E19, 'e'), + (0x1E1B, 'e'), + (0x1E1D, 'e'), + (0x1EB9, 'e'), + (0x1EBB, 'e'), + (0x1EBD, 'e'), + (0x1EBF, 'e'), + (0x1EC1, 'e'), + (0x1EC3, 'e'), + (0x1EC5, 'e'), + (0x1EC7, 'e'), + (0x2091, 'e'), + (0x24D4, 'e'), + (0x2C78, 'e'), + (0xFF45, 'e'), + (0x24A0, '(e)'), + (0x191, 'F'), + (0x1E1E, 'F'), + (0x24BB, 'F'), + (0xA730, 'F'), + (0xA77B, 'F'), + 
(0xA7FB, 'F'), + (0xFF26, 'F'), + (0x192, 'f'), + (0x1D6E, 'f'), + (0x1D82, 'f'), + (0x1E1F, 'f'), + (0x1E9B, 'f'), + (0x24D5, 'f'), + (0xA77C, 'f'), + (0xFF46, 'f'), + (0x24A1, '(f)'), + (0xFB00, 'ff'), + (0xFB03, 'ffi'), + (0xFB04, 'ffl'), + (0xFB01, 'fi'), + (0xFB02, 'fl'), + (0x11C, 'G'), + (0x11E, 'G'), + (0x120, 'G'), + (0x122, 'G'), + (0x193, 'G'), + (0x1E4, 'G'), + (0x1E5, 'G'), + (0x1E6, 'G'), + (0x1E7, 'G'), + (0x1F4, 'G'), + (0x262, 'G'), + (0x29B, 'G'), + (0x1E20, 'G'), + (0x24BC, 'G'), + (0xA77D, 'G'), + (0xA77E, 'G'), + (0xFF27, 'G'), + (0x11D, 'g'), + (0x11F, 'g'), + (0x121, 'g'), + (0x123, 'g'), + (0x1F5, 'g'), + (0x260, 'g'), + (0x261, 'g'), + (0x1D77, 'g'), + (0x1D79, 'g'), + (0x1D83, 'g'), + (0x1E21, 'g'), + (0x24D6, 'g'), + (0xA77F, 'g'), + (0xFF47, 'g'), + (0x24A2, '(g)'), + (0x124, 'H'), + (0x126, 'H'), + (0x21E, 'H'), + (0x29C, 'H'), + (0x1E22, 'H'), + (0x1E24, 'H'), + (0x1E26, 'H'), + (0x1E28, 'H'), + (0x1E2A, 'H'), + (0x24BD, 'H'), + (0x2C67, 'H'), + (0x2C75, 'H'), + (0xFF28, 'H'), + (0x125, 'h'), + (0x127, 'h'), + (0x21F, 'h'), + (0x265, 'h'), + (0x266, 'h'), + (0x2AE, 'h'), + (0x2AF, 'h'), + (0x1E23, 'h'), + (0x1E25, 'h'), + (0x1E27, 'h'), + (0x1E29, 'h'), + (0x1E2B, 'h'), + (0x1E96, 'h'), + (0x24D7, 'h'), + (0x2C68, 'h'), + (0x2C76, 'h'), + (0xFF48, 'h'), + (0x1F6, 'HV'), + (0x24A3, '(h)'), + (0x195, 'hv'), + (0xCC, 'I'), + (0xCD, 'I'), + (0xCE, 'I'), + (0xCF, 'I'), + (0x128, 'I'), + (0x12A, 'I'), + (0x12C, 'I'), + (0x12E, 'I'), + (0x130, 'I'), + (0x196, 'I'), + (0x197, 'I'), + (0x1CF, 'I'), + (0x208, 'I'), + (0x20A, 'I'), + (0x26A, 'I'), + (0x1D7B, 'I'), + (0x1E2C, 'I'), + (0x1E2E, 'I'), + (0x1EC8, 'I'), + (0x1ECA, 'I'), + (0x24BE, 'I'), + (0xA7FE, 'I'), + (0xFF29, 'I'), + (0xEC, 'i'), + (0xED, 'i'), + (0xEE, 'i'), + (0xEF, 'i'), + (0x129, 'i'), + (0x12B, 'i'), + (0x12D, 'i'), + (0x12F, 'i'), + (0x131, 'i'), + (0x1D0, 'i'), + (0x209, 'i'), + (0x20B, 'i'), + (0x268, 'i'), + (0x1D09, 'i'), + (0x1D62, 'i'), + (0x1D7C, 'i'), + (0x1D96, 
'i'), + (0x1E2D, 'i'), + (0x1E2F, 'i'), + (0x1EC9, 'i'), + (0x1ECB, 'i'), + (0x2071, 'i'), + (0x24D8, 'i'), + (0xFF49, 'i'), + (0x132, 'IJ'), + (0x24A4, '(i)'), + (0x133, 'ij'), + (0x134, 'J'), + (0x248, 'J'), + (0x1D0A, 'J'), + (0x24BF, 'J'), + (0xFF2A, 'J'), + (0x135, 'j'), + (0x1F0, 'j'), + (0x237, 'j'), + (0x249, 'j'), + (0x25F, 'j'), + (0x284, 'j'), + (0x29D, 'j'), + (0x24D9, 'j'), + (0x2C7C, 'j'), + (0xFF4A, 'j'), + (0x24A5, '(j)'), + (0x136, 'K'), + (0x198, 'K'), + (0x1E8, 'K'), + (0x1D0B, 'K'), + (0x1E30, 'K'), + (0x1E32, 'K'), + (0x1E34, 'K'), + (0x24C0, 'K'), + (0x2C69, 'K'), + (0xA740, 'K'), + (0xA742, 'K'), + (0xA744, 'K'), + (0xFF2B, 'K'), + (0x137, 'k'), + (0x199, 'k'), + (0x1E9, 'k'), + (0x29E, 'k'), + (0x1D84, 'k'), + (0x1E31, 'k'), + (0x1E33, 'k'), + (0x1E35, 'k'), + (0x24DA, 'k'), + (0x2C6A, 'k'), + (0xA741, 'k'), + (0xA743, 'k'), + (0xA745, 'k'), + (0xFF4B, 'k'), + (0x24A6, '(k)'), + (0x139, 'L'), + (0x13B, 'L'), + (0x13D, 'L'), + (0x13F, 'L'), + (0x141, 'L'), + (0x23D, 'L'), + (0x29F, 'L'), + (0x1D0C, 'L'), + (0x1E36, 'L'), + (0x1E38, 'L'), + (0x1E3A, 'L'), + (0x1E3C, 'L'), + (0x24C1, 'L'), + (0x2C60, 'L'), + (0x2C62, 'L'), + (0xA746, 'L'), + (0xA748, 'L'), + (0xA780, 'L'), + (0xFF2C, 'L'), + (0x13A, 'l'), + (0x13C, 'l'), + (0x13E, 'l'), + (0x140, 'l'), + (0x142, 'l'), + (0x19A, 'l'), + (0x234, 'l'), + (0x26B, 'l'), + (0x26C, 'l'), + (0x26D, 'l'), + (0x1D85, 'l'), + (0x1E37, 'l'), + (0x1E39, 'l'), + (0x1E3B, 'l'), + (0x1E3D, 'l'), + (0x24DB, 'l'), + (0x2C61, 'l'), + (0xA747, 'l'), + (0xA749, 'l'), + (0xA781, 'l'), + (0xFF4C, 'l'), + (0x1C7, 'LJ'), + (0x1EFA, 'LL'), + (0x1C8, 'Lj'), + (0x24A7, '(l)'), + (0x1C9, 'lj'), + (0x1EFB, 'll'), + (0x2AA, 'ls'), + (0x2AB, 'lz'), + (0x19C, 'M'), + (0x1D0D, 'M'), + (0x1E3E, 'M'), + (0x1E40, 'M'), + (0x1E42, 'M'), + (0x24C2, 'M'), + (0x2C6E, 'M'), + (0xA7FD, 'M'), + (0xA7FF, 'M'), + (0xFF2D, 'M'), + (0x26F, 'm'), + (0x270, 'm'), + (0x271, 'm'), + (0x1D6F, 'm'), + (0x1D86, 'm'), + (0x1E3F, 'm'), + (0x1E41, 
'm'), + (0x1E43, 'm'), + (0x24DC, 'm'), + (0xFF4D, 'm'), + (0x24A8, '(m)'), + (0xD1, 'N'), + (0x143, 'N'), + (0x145, 'N'), + (0x147, 'N'), + (0x14A, 'N'), + (0x19D, 'N'), + (0x1F8, 'N'), + (0x220, 'N'), + (0x274, 'N'), + (0x1D0E, 'N'), + (0x1E44, 'N'), + (0x1E46, 'N'), + (0x1E48, 'N'), + (0x1E4A, 'N'), + (0x24C3, 'N'), + (0xFF2E, 'N'), + (0xF1, 'n'), + (0x144, 'n'), + (0x146, 'n'), + (0x148, 'n'), + (0x149, 'n'), + (0x14B, 'n'), + (0x19E, 'n'), + (0x1F9, 'n'), + (0x235, 'n'), + (0x272, 'n'), + (0x273, 'n'), + (0x1D70, 'n'), + (0x1D87, 'n'), + (0x1E45, 'n'), + (0x1E47, 'n'), + (0x1E49, 'n'), + (0x1E4B, 'n'), + (0x207F, 'n'), + (0x24DD, 'n'), + (0xFF4E, 'n'), + (0x1CA, 'NJ'), + (0x1CB, 'Nj'), + (0x24A9, '(n)'), + (0x1CC, 'nj'), + (0xD2, 'O'), + (0xD3, 'O'), + (0xD4, 'O'), + (0xD5, 'O'), + (0xD6, 'O'), + (0xD8, 'O'), + (0x14C, 'O'), + (0x14E, 'O'), + (0x150, 'O'), + (0x186, 'O'), + (0x19F, 'O'), + (0x1A0, 'O'), + (0x1D1, 'O'), + (0x1EA, 'O'), + (0x1EC, 'O'), + (0x1FE, 'O'), + (0x20C, 'O'), + (0x20E, 'O'), + (0x22A, 'O'), + (0x22C, 'O'), + (0x22E, 'O'), + (0x230, 'O'), + (0x1D0F, 'O'), + (0x1D10, 'O'), + (0x1E4C, 'O'), + (0x1E4E, 'O'), + (0x1E50, 'O'), + (0x1E52, 'O'), + (0x1ECC, 'O'), + (0x1ECE, 'O'), + (0x1ED0, 'O'), + (0x1ED2, 'O'), + (0x1ED4, 'O'), + (0x1ED6, 'O'), + (0x1ED8, 'O'), + (0x1EDA, 'O'), + (0x1EDC, 'O'), + (0x1EDE, 'O'), + (0x1EE0, 'O'), + (0x1EE2, 'O'), + (0x24C4, 'O'), + (0xA74A, 'O'), + (0xA74C, 'O'), + (0xFF2F, 'O'), + (0xF2, 'o'), + (0xF3, 'o'), + (0xF4, 'o'), + (0xF5, 'o'), + (0xF6, 'o'), + (0xF8, 'o'), + (0x14D, 'o'), + (0x14F, 'o'), + (0x151, 'o'), + (0x1A1, 'o'), + (0x1D2, 'o'), + (0x1EB, 'o'), + (0x1ED, 'o'), + (0x1FF, 'o'), + (0x20D, 'o'), + (0x20F, 'o'), + (0x22B, 'o'), + (0x22D, 'o'), + (0x22F, 'o'), + (0x231, 'o'), + (0x254, 'o'), + (0x275, 'o'), + (0x1D16, 'o'), + (0x1D17, 'o'), + (0x1D97, 'o'), + (0x1E4D, 'o'), + (0x1E4F, 'o'), + (0x1E51, 'o'), + (0x1E53, 'o'), + (0x1ECD, 'o'), + (0x1ECF, 'o'), + (0x1ED1, 'o'), + (0x1ED3, 'o'), + (0x1ED5, 
'o'), + (0x1ED7, 'o'), + (0x1ED9, 'o'), + (0x1EDB, 'o'), + (0x1EDD, 'o'), + (0x1EDF, 'o'), + (0x1EE1, 'o'), + (0x1EE3, 'o'), + (0x2092, 'o'), + (0x24DE, 'o'), + (0x2C7A, 'o'), + (0xA74B, 'o'), + (0xA74D, 'o'), + (0xFF4F, 'o'), + (0x152, 'OE'), + (0x276, 'OE'), + (0xA74E, 'OO'), + (0x222, 'OU'), + (0x1D15, 'OU'), + (0x24AA, '(o)'), + (0x153, 'oe'), + (0x1D14, 'oe'), + (0xA74F, 'oo'), + (0x223, 'ou'), + (0x1A4, 'P'), + (0x1D18, 'P'), + (0x1E54, 'P'), + (0x1E56, 'P'), + (0x24C5, 'P'), + (0x2C63, 'P'), + (0xA750, 'P'), + (0xA752, 'P'), + (0xA754, 'P'), + (0xFF30, 'P'), + (0x1A5, 'p'), + (0x1D71, 'p'), + (0x1D7D, 'p'), + (0x1D88, 'p'), + (0x1E55, 'p'), + (0x1E57, 'p'), + (0x24DF, 'p'), + (0xA751, 'p'), + (0xA753, 'p'), + (0xA755, 'p'), + (0xA7FC, 'p'), + (0xFF50, 'p'), + (0x24AB, '(p)'), + (0x24A, 'Q'), + (0x24C6, 'Q'), + (0xA756, 'Q'), + (0xA758, 'Q'), + (0xFF31, 'Q'), + (0x138, 'q'), + (0x24B, 'q'), + (0x2A0, 'q'), + (0x24E0, 'q'), + (0xA757, 'q'), + (0xA759, 'q'), + (0xFF51, 'q'), + (0x24AC, '(q)'), + (0x239, 'qp'), + (0x154, 'R'), + (0x156, 'R'), + (0x158, 'R'), + (0x210, 'R'), + (0x212, 'R'), + (0x24C, 'R'), + (0x280, 'R'), + (0x281, 'R'), + (0x1D19, 'R'), + (0x1D1A, 'R'), + (0x1E58, 'R'), + (0x1E5A, 'R'), + (0x1E5C, 'R'), + (0x1E5E, 'R'), + (0x24C7, 'R'), + (0x2C64, 'R'), + (0xA75A, 'R'), + (0xA782, 'R'), + (0xFF32, 'R'), + (0x155, 'r'), + (0x157, 'r'), + (0x159, 'r'), + (0x211, 'r'), + (0x213, 'r'), + (0x24D, 'r'), + (0x27C, 'r'), + (0x27D, 'r'), + (0x27E, 'r'), + (0x27F, 'r'), + (0x1D63, 'r'), + (0x1D72, 'r'), + (0x1D73, 'r'), + (0x1D89, 'r'), + (0x1E59, 'r'), + (0x1E5B, 'r'), + (0x1E5D, 'r'), + (0x1E5F, 'r'), + (0x24E1, 'r'), + (0xA75B, 'r'), + (0xA783, 'r'), + (0xFF52, 'r'), + (0x24AD, '(r)'), + (0x15A, 'S'), + (0x15C, 'S'), + (0x15E, 'S'), + (0x160, 'S'), + (0x218, 'S'), + (0x1E60, 'S'), + (0x1E62, 'S'), + (0x1E64, 'S'), + (0x1E66, 'S'), + (0x1E68, 'S'), + (0x24C8, 'S'), + (0xA731, 'S'), + (0xA785, 'S'), + (0xFF33, 'S'), + (0x15B, 's'), + (0x15D, 's'), + 
(0x15F, 's'), + (0x161, 's'), + (0x17F, 's'), + (0x219, 's'), + (0x23F, 's'), + (0x282, 's'), + (0x1D74, 's'), + (0x1D8A, 's'), + (0x1E61, 's'), + (0x1E63, 's'), + (0x1E65, 's'), + (0x1E67, 's'), + (0x1E69, 's'), + (0x1E9C, 's'), + (0x1E9D, 's'), + (0x24E2, 's'), + (0xA784, 's'), + (0xFF53, 's'), + (0x1E9E, 'SS'), + (0x24AE, '(s)'), + (0xDF, 'ss'), + (0xFB06, 'st'), + (0x162, 'T'), + (0x164, 'T'), + (0x166, 'T'), + (0x1AC, 'T'), + (0x1AE, 'T'), + (0x21A, 'T'), + (0x23E, 'T'), + (0x1D1B, 'T'), + (0x1E6A, 'T'), + (0x1E6C, 'T'), + (0x1E6E, 'T'), + (0x1E70, 'T'), + (0x24C9, 'T'), + (0xA786, 'T'), + (0xFF34, 'T'), + (0x163, 't'), + (0x165, 't'), + (0x167, 't'), + (0x1AB, 't'), + (0x1AD, 't'), + (0x21B, 't'), + (0x236, 't'), + (0x287, 't'), + (0x288, 't'), + (0x1D75, 't'), + (0x1E6B, 't'), + (0x1E6D, 't'), + (0x1E6F, 't'), + (0x1E71, 't'), + (0x1E97, 't'), + (0x24E3, 't'), + (0x2C66, 't'), + (0xFF54, 't'), + (0xDE, 'TH'), + (0xA766, 'TH'), + (0xA728, 'TZ'), + (0x24AF, '(t)'), + (0x2A8, 'tc'), + (0xFE, 'th'), + (0x1D7A, 'th'), + (0xA767, 'th'), + (0x2A6, 'ts'), + (0xA729, 'tz'), + (0xD9, 'U'), + (0xDA, 'U'), + (0xDB, 'U'), + (0xDC, 'U'), + (0x168, 'U'), + (0x16A, 'U'), + (0x16C, 'U'), + (0x16E, 'U'), + (0x170, 'U'), + (0x172, 'U'), + (0x1AF, 'U'), + (0x1D3, 'U'), + (0x1D5, 'U'), + (0x1D7, 'U'), + (0x1D9, 'U'), + (0x1DB, 'U'), + (0x214, 'U'), + (0x216, 'U'), + (0x244, 'U'), + (0x1D1C, 'U'), + (0x1D7E, 'U'), + (0x1E72, 'U'), + (0x1E74, 'U'), + (0x1E76, 'U'), + (0x1E78, 'U'), + (0x1E7A, 'U'), + (0x1EE4, 'U'), + (0x1EE6, 'U'), + (0x1EE8, 'U'), + (0x1EEA, 'U'), + (0x1EEC, 'U'), + (0x1EEE, 'U'), + (0x1EF0, 'U'), + (0x24CA, 'U'), + (0xFF35, 'U'), + (0xF9, 'u'), + (0xFA, 'u'), + (0xFB, 'u'), + (0xFC, 'u'), + (0x169, 'u'), + (0x16B, 'u'), + (0x16D, 'u'), + (0x16F, 'u'), + (0x171, 'u'), + (0x173, 'u'), + (0x1B0, 'u'), + (0x1D4, 'u'), + (0x1D6, 'u'), + (0x1D8, 'u'), + (0x1DA, 'u'), + (0x1DC, 'u'), + (0x215, 'u'), + (0x217, 'u'), + (0x289, 'u'), + (0x1D64, 'u'), + (0x1D99, 'u'), + 
(0x1E73, 'u'), + (0x1E75, 'u'), + (0x1E77, 'u'), + (0x1E79, 'u'), + (0x1E7B, 'u'), + (0x1EE5, 'u'), + (0x1EE7, 'u'), + (0x1EE9, 'u'), + (0x1EEB, 'u'), + (0x1EED, 'u'), + (0x1EEF, 'u'), + (0x1EF1, 'u'), + (0x24E4, 'u'), + (0xFF55, 'u'), + (0x24B0, '(u)'), + (0x1D6B, 'ue'), + (0x1B2, 'V'), + (0x245, 'V'), + (0x1D20, 'V'), + (0x1E7C, 'V'), + (0x1E7E, 'V'), + (0x1EFC, 'V'), + (0x24CB, 'V'), + (0xA75E, 'V'), + (0xA768, 'V'), + (0xFF36, 'V'), + (0x28B, 'v'), + (0x28C, 'v'), + (0x1D65, 'v'), + (0x1D8C, 'v'), + (0x1E7D, 'v'), + (0x1E7F, 'v'), + (0x24E5, 'v'), + (0x2C71, 'v'), + (0x2C74, 'v'), + (0xA75F, 'v'), + (0xFF56, 'v'), + (0xA760, 'VY'), + (0x24B1, '(v)'), + (0xA761, 'vy'), + (0x174, 'W'), + (0x1F7, 'W'), + (0x1D21, 'W'), + (0x1E80, 'W'), + (0x1E82, 'W'), + (0x1E84, 'W'), + (0x1E86, 'W'), + (0x1E88, 'W'), + (0x24CC, 'W'), + (0x2C72, 'W'), + (0xFF37, 'W'), + (0x175, 'w'), + (0x1BF, 'w'), + (0x28D, 'w'), + (0x1E81, 'w'), + (0x1E83, 'w'), + (0x1E85, 'w'), + (0x1E87, 'w'), + (0x1E89, 'w'), + (0x1E98, 'w'), + (0x24E6, 'w'), + (0x2C73, 'w'), + (0xFF57, 'w'), + (0x24B2, '(w)'), + (0x1E8A, 'X'), + (0x1E8C, 'X'), + (0x24CD, 'X'), + (0xFF38, 'X'), + (0x1D8D, 'x'), + (0x1E8B, 'x'), + (0x1E8D, 'x'), + (0x2093, 'x'), + (0x24E7, 'x'), + (0xFF58, 'x'), + (0x24B3, '(x)'), + (0xDD, 'Y'), + (0x176, 'Y'), + (0x178, 'Y'), + (0x1B3, 'Y'), + (0x232, 'Y'), + (0x24E, 'Y'), + (0x28F, 'Y'), + (0x1E8E, 'Y'), + (0x1EF2, 'Y'), + (0x1EF4, 'Y'), + (0x1EF6, 'Y'), + (0x1EF8, 'Y'), + (0x1EFE, 'Y'), + (0x24CE, 'Y'), + (0xFF39, 'Y'), + (0xFD, 'y'), + (0xFF, 'y'), + (0x177, 'y'), + (0x1B4, 'y'), + (0x233, 'y'), + (0x24F, 'y'), + (0x28E, 'y'), + (0x1E8F, 'y'), + (0x1E99, 'y'), + (0x1EF3, 'y'), + (0x1EF5, 'y'), + (0x1EF7, 'y'), + (0x1EF9, 'y'), + (0x1EFF, 'y'), + (0x24E8, 'y'), + (0xFF59, 'y'), + (0x24B4, '(y)'), + (0x179, 'Z'), + (0x17B, 'Z'), + (0x17D, 'Z'), + (0x1B5, 'Z'), + (0x21C, 'Z'), + (0x224, 'Z'), + (0x1D22, 'Z'), + (0x1E90, 'Z'), + (0x1E92, 'Z'), + (0x1E94, 'Z'), + (0x24CF, 'Z'), + (0x2C6B, 
'Z'), + (0xA762, 'Z'), + (0xFF3A, 'Z'), + (0x17A, 'z'), + (0x17C, 'z'), + (0x17E, 'z'), + (0x1B6, 'z'), + (0x21D, 'z'), + (0x225, 'z'), + (0x240, 'z'), + (0x290, 'z'), + (0x291, 'z'), + (0x1D76, 'z'), + (0x1D8E, 'z'), + (0x1E91, 'z'), + (0x1E93, 'z'), + (0x1E95, 'z'), + (0x24E9, 'z'), + (0x2C6C, 'z'), + (0xA763, 'z'), + (0xFF5A, 'z'), + (0x24B5, '(z)'), + (0x2070, '0'), + (0x2080, '0'), + (0x24EA, '0'), + (0x24FF, '0'), + (0xFF10, '0'), + (0xB9, '1'), + (0x2081, '1'), + (0x2460, '1'), + (0x24F5, '1'), + (0x2776, '1'), + (0x2780, '1'), + (0x278A, '1'), + (0xFF11, '1'), + (0x2488, '1.'), + (0x2474, '(1)'), + (0xB2, '2'), + (0x2082, '2'), + (0x2461, '2'), + (0x24F6, '2'), + (0x2777, '2'), + (0x2781, '2'), + (0x278B, '2'), + (0xFF12, '2'), + (0x2489, '2.'), + (0x2475, '(2)'), + (0xB3, '3'), + (0x2083, '3'), + (0x2462, '3'), + (0x24F7, '3'), + (0x2778, '3'), + (0x2782, '3'), + (0x278C, '3'), + (0xFF13, '3'), + (0x248A, '3.'), + (0x2476, '(3)'), + (0x2074, '4'), + (0x2084, '4'), + (0x2463, '4'), + (0x24F8, '4'), + (0x2779, '4'), + (0x2783, '4'), + (0x278D, '4'), + (0xFF14, '4'), + (0x248B, '4.'), + (0x2477, '(4)'), + (0x2075, '5'), + (0x2085, '5'), + (0x2464, '5'), + (0x24F9, '5'), + (0x277A, '5'), + (0x2784, '5'), + (0x278E, '5'), + (0xFF15, '5'), + (0x248C, '5.'), + (0x2478, '(5)'), + (0x2076, '6'), + (0x2086, '6'), + (0x2465, '6'), + (0x24FA, '6'), + (0x277B, '6'), + (0x2785, '6'), + (0x278F, '6'), + (0xFF16, '6'), + (0x248D, '6.'), + (0x2479, '(6)'), + (0x2077, '7'), + (0x2087, '7'), + (0x2466, '7'), + (0x24FB, '7'), + (0x277C, '7'), + (0x2786, '7'), + (0x2790, '7'), + (0xFF17, '7'), + (0x248E, '7.'), + (0x247A, '(7)'), + (0x2078, '8'), + (0x2088, '8'), + (0x2467, '8'), + (0x24FC, '8'), + (0x277D, '8'), + (0x2787, '8'), + (0x2791, '8'), + (0xFF18, '8'), + (0x248F, '8.'), + (0x247B, '(8)'), + (0x2079, '9'), + (0x2089, '9'), + (0x2468, '9'), + (0x24FD, '9'), + (0x277E, '9'), + (0x2788, '9'), + (0x2792, '9'), + (0xFF19, '9'), + (0x2490, '9.'), + (0x247C, '(9)'), + 
(0x2469, '10'), + (0x24FE, '10'), + (0x277F, '10'), + (0x2789, '10'), + (0x2793, '10'), + (0x2491, '10.'), + (0x247D, '(10)'), + (0x246A, '11'), + (0x24EB, '11'), + (0x2492, '11.'), + (0x247E, '(11)'), + (0x246B, '12'), + (0x24EC, '12'), + (0x2493, '12.'), + (0x247F, '(12)'), + (0x246C, '13'), + (0x24ED, '13'), + (0x2494, '13.'), + (0x2480, '(13)'), + (0x246D, '14'), + (0x24EE, '14'), + (0x2495, '14.'), + (0x2481, '(14)'), + (0x246E, '15'), + (0x24EF, '15'), + (0x2496, '15.'), + (0x2482, '(15)'), + (0x246F, '16'), + (0x24F0, '16'), + (0x2497, '16.'), + (0x2483, '(16)'), + (0x2470, '17'), + (0x24F1, '17'), + (0x2498, '17.'), + (0x2484, '(17)'), + (0x2471, '18'), + (0x24F2, '18'), + (0x2499, '18.'), + (0x2485, '(18)'), + (0x2472, '19'), + (0x24F3, '19'), + (0x249A, '19.'), + (0x2486, '(19)'), + (0x2473, '20'), + (0x24F4, '20'), + (0x249B, '20.'), + (0x2487, '(20)'), + (0xAB, '"'), + (0xBB, '"'), + (0x201C, '"'), + (0x201D, '"'), + (0x201E, '"'), + (0x2033, '"'), + (0x2036, '"'), + (0x275D, '"'), + (0x275E, '"'), + (0x276E, '"'), + (0x276F, '"'), + (0xFF02, '"'), + (0x2018, '\''), + (0x2019, '\''), + (0x201A, '\''), + (0x201B, '\''), + (0x2032, '\''), + (0x2035, '\''), + (0x2039, '\''), + (0x203A, '\''), + (0x275B, '\''), + (0x275C, '\''), + (0xFF07, '\''), + (0x2010, '-'), + (0x2011, '-'), + (0x2012, '-'), + (0x2013, '-'), + (0x2014, '-'), + (0x207B, '-'), + (0x208B, '-'), + (0xFF0D, '-'), + (0x2045, '['), + (0x2772, '['), + (0xFF3B, '['), + (0x2046, ']'), + (0x2773, ']'), + (0xFF3D, ']'), + (0x207D, '('), + (0x208D, '('), + (0x2768, '('), + (0x276A, '('), + (0xFF08, '('), + (0x2E28, '(('), + (0x207E, ')'), + (0x208E, ')'), + (0x2769, ')'), + (0x276B, ')'), + (0xFF09, ')'), + (0x2E29, '))'), + (0x276C, '<'), + (0x2770, '<'), + (0xFF1C, '<'), + (0x276D, '>'), + (0x2771, '>'), + (0xFF1E, '>'), + (0x2774, '{'), + (0xFF5B, '{'), + (0x2775, '}'), + (0xFF5D, '}'), + (0x207A, '+'), + (0x208A, '+'), + (0xFF0B, '+'), + (0x207C, '='), + (0x208C, '='), + (0xFF1D, '='), + 
(0xFF01, '!'), + (0x203C, '!!'), + (0x2049, '!?'), + (0xFF03, '#'), + (0xFF04, '$'), + (0x2052, '%'), + (0xFF05, '%'), + (0xFF06, '&'), + (0x204E, '*'), + (0xFF0A, '*'), + (0xFF0C, ','), + (0xFF0E, '.'), + (0x2044, '/'), + (0xFF0F, '/'), + (0xFF1A, ':'), + (0x204F, ';'), + (0xFF1B, ';'), + (0xFF1F, '?'), + (0x2047, '??'), + (0x2048, '?!'), + (0xFF20, '@'), + (0xFF3C, '\\'), + (0x2038, '^'), + (0xFF3E, '^'), + (0xFF3F, '_'), + (0x2053, '~'), + (0xFF5E, '~') +] + +offsets = [] +replacements = [] +replacements_lw = [] + +for t in table: + offsets.append("0x%08X" % t[0]) + + replacement = t[1].encode("utf-8") + s = "" + for b in reversed(replacement): + s += "%02X" % b + replacements.append("0x%08X" % int(s, 16)) + + replacement_lw = t[1].lower().encode("utf-8") + s_lw = "" + for b in reversed(replacement_lw): + s_lw += "%02X" % b + replacements_lw.append("0x%08X" % int(s_lw, 16)) + + +print("static const int offsets[] = {%s};" % ",".join(offsets)) +print("static const int replacements[] = {%s};" % ",".join(replacements)) +print("static const int replacements_lw[] = {%s};" % ",".join(replacements_lw)) + diff --git a/utf8.h b/utf8.h new file mode 100644 index 0000000..7778cc3 --- /dev/null +++ b/utf8.h @@ -0,0 +1,1471 @@ +// The latest version of this library is available on GitHub; +// https://github.com/sheredom/utf8.h + +// This is free and unencumbered software released into the public domain. +// +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. 
We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// For more information, please refer to + +#ifndef SHEREDOM_UTF8_H_INCLUDED +#define SHEREDOM_UTF8_H_INCLUDED + +#if defined(_MSC_VER) +#pragma warning(push) + +// disable 'bytes padding added after construct' warning +#pragma warning(disable : 4820) +#endif + +#include +#include + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#if defined(_MSC_VER) +typedef __int32 utf8_int32_t; +#else + +#include + +typedef int32_t utf8_int32_t; +#endif + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" +#pragma clang diagnostic ignored "-Wcast-qual" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) || defined(__GNUC__) +#define utf8_nonnull __attribute__((nonnull)) +#define utf8_pure __attribute__((pure)) +#define utf8_restrict __restrict__ +#define utf8_weak __attribute__((weak)) +#elif defined(_MSC_VER) +#define utf8_nonnull +#define utf8_pure +#define utf8_restrict __restrict +#define utf8_weak __inline +#else +#error Non clang, non gcc, non MSVC compiler found! +#endif + +#ifdef __cplusplus +#define utf8_null NULL +#else +#define utf8_null 0 +#endif + +// Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 > +// src2 respectively, case insensitive. 
+utf8_nonnull utf8_pure utf8_weak int utf8casecmp(const void *src1, + const void *src2); + +// Append the utf8 string src onto the utf8 string dst and return dst. +// No size is passed, so dst must already have room for the bytes of src +// plus the null terminating byte. +utf8_nonnull utf8_weak void *utf8cat(void *utf8_restrict dst, + const void *utf8_restrict src); + +// Find the first match of the utf8 codepoint chr in the utf8 string src. +// Returns 0 if there is no match. If chr is 0, returns the position of the +// null terminating byte of src. +utf8_nonnull utf8_pure utf8_weak void *utf8chr(const void *src, + utf8_int32_t chr); + +// Return less than 0, 0, greater than 0 if src1 < src2, +// src1 == src2, src1 > src2 respectively. The strings are compared +// byte-by-byte as unsigned values. +utf8_nonnull utf8_pure utf8_weak int utf8cmp(const void *src1, + const void *src2); + +// Copy the utf8 string src onto the memory allocated in dst and return dst. +// No size is passed, so dst must have room for src and its null terminator. +utf8_nonnull utf8_weak void *utf8cpy(void *utf8_restrict dst, + const void *utf8_restrict src); + +// Number of utf8 codepoints at the start of the utf8 string src that consist +// entirely of utf8 codepoints not from the utf8 string reject. +utf8_nonnull utf8_pure utf8_weak size_t utf8cspn(const void *src, + const void *reject); + +// Duplicate the utf8 string src by getting its size, malloc'ing a new buffer, +// copying over the data, and returning that. Returns 0 if malloc failed. +// The buffer comes from malloc, so the caller releases it with free. +utf8_nonnull utf8_weak void *utf8dup(const void *src); + +// Number of utf8 codepoints in the utf8 string str, +// excluding the null terminating byte. The size of each codepoint is taken +// from its leading byte alone, so str is expected to be valid utf8. +utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str); + +// Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 > +// src2 respectively, case insensitive. Checks at most n bytes of each utf8 +// string. +utf8_nonnull utf8_pure utf8_weak int utf8ncasecmp(const void *src1, + const void *src2, size_t n); + +// Append the utf8 string src onto the utf8 string dst, +// writing at most n+1 bytes. Can produce an invalid utf8 +// string if n falls partway through a utf8 codepoint. +// NOTE(review): the implementation in this file copies the first byte of +// src before it tests n or the terminator, so n == 0 or an empty src is not +// handled as a no-op; confirm callers never pass either. 
+utf8_nonnull utf8_weak void *utf8ncat(void *utf8_restrict dst, + const void *utf8_restrict src, size_t n); + +// Return less than 0, 0, greater than 0 if src1 < src2, +// src1 == src2, src1 > src2 respectively. Checks at most n +// bytes of each utf8 string. +utf8_nonnull utf8_pure utf8_weak int utf8ncmp(const void *src1, + const void *src2, size_t n); + +// Copy the utf8 string src onto the memory allocated in dst. +// Copies at most n bytes. If there is no terminating null byte in +// the first n bytes of src, the string placed into dst will not be +// null-terminated. If the size (in bytes) of src is less than n, +// extra null terminating bytes are appended to dst such that a +// total of n bytes are written. Can produce an invalid utf8 +// string if n falls partway through a utf8 codepoint. Returns dst. +utf8_nonnull utf8_weak void *utf8ncpy(void *utf8_restrict dst, + const void *utf8_restrict src, size_t n); + +// Similar to utf8dup, except that at most n bytes of src are copied. If src is +// longer than n, only n bytes are copied and a null byte is added, so the +// result is always null-terminated. +// +// Returns a new malloc'ed string if successful, 0 otherwise. +utf8_nonnull utf8_weak void *utf8ndup(const void *src, size_t n); + +// Locates the first occurrence in the utf8 string str of any utf8 codepoint +// in the utf8 string accept, or returns 0 if no match was found. +utf8_nonnull utf8_pure utf8_weak void *utf8pbrk(const void *str, + const void *accept); + +// Find the last match of the utf8 codepoint chr in the utf8 string src. +// Returns 0 if there is no match. If chr is 0, returns the position of the +// null terminating byte of src. +// NOTE(review): the matching loop in the implementation compares s[offset] +// with c[offset] without stopping at a null byte, so a match that ends +// exactly at the end of src appears to keep reading past both buffers; +// verify before relying on it. +utf8_nonnull utf8_pure utf8_weak void *utf8rchr(const void *src, int chr); + +// Number of bytes in the utf8 string str, +// including the null terminating byte. +utf8_nonnull utf8_pure utf8_weak size_t utf8size(const void *str); + +// Number of utf8 codepoints at the start of the utf8 string src that consist +// entirely of utf8 codepoints from the utf8 string accept. 
+utf8_nonnull utf8_pure utf8_weak size_t utf8spn(const void *src, + const void *accept); + +// The position of the utf8 string needle in the utf8 string haystack. +// Returns 0 if needle does not occur, and haystack itself if needle is empty. +utf8_nonnull utf8_pure utf8_weak void *utf8str(const void *haystack, + const void *needle); + +// The position of the utf8 string needle in the utf8 string haystack, case +// insensitive. Returns 0 if needle does not occur, and haystack itself if +// needle is empty. +utf8_nonnull utf8_pure utf8_weak void *utf8casestr(const void *haystack, + const void *needle); + +// Return 0 on success, or the position of the invalid +// utf8 codepoint on failure. +utf8_nonnull utf8_pure utf8_weak void *utf8valid(const void *str); + +// Sets out_codepoint to the next utf8 codepoint in str, and returns the address +// of the utf8 codepoint after the current one in str. +utf8_nonnull utf8_weak void * +utf8codepoint(const void *utf8_restrict str, + utf8_int32_t *utf8_restrict out_codepoint); + +// Calculates the size in bytes of the next utf8 codepoint in str. +utf8_nonnull utf8_weak size_t utf8codepointcalcsize(const void *utf8_restrict str); + +// Returns the size of the given codepoint in bytes. +utf8_weak size_t utf8codepointsize(utf8_int32_t chr); + +// Write a codepoint to the given string, and return the address of the next +// place after the written codepoint. Pass the number of bytes left in the +// buffer as n. If there is not enough space for the codepoint, this function +// returns null. +utf8_nonnull utf8_weak void *utf8catcodepoint(void *utf8_restrict str, + utf8_int32_t chr, size_t n); + +// Returns 1 if the given character is lowercase, or 0 if it is not. +utf8_weak int utf8islower(utf8_int32_t chr); + +// Returns 1 if the given character is uppercase, or 0 if it is not. +utf8_weak int utf8isupper(utf8_int32_t chr); + +// Transform the given string into all lowercase codepoints, in place. +utf8_nonnull utf8_weak void utf8lwr(void *utf8_restrict str); + +// Transform the given string into all uppercase codepoints, in place. 
+utf8_nonnull utf8_weak void utf8upr(void *utf8_restrict str); + +// Make a codepoint lower case if possible. +utf8_weak utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp); + +// Make a codepoint upper case if possible. +utf8_weak utf8_int32_t utf8uprcodepoint(utf8_int32_t cp); + +#undef utf8_weak +#undef utf8_pure +#undef utf8_nonnull + +int utf8casecmp(const void *src1, const void *src2) { + utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp; + + for (;;) { + src1 = utf8codepoint(src1, &src1_cp); + src2 = utf8codepoint(src2, &src2_cp); + + // take a copy of src1 & src2 + src1_orig_cp = src1_cp; + src2_orig_cp = src2_cp; + + // lower the srcs if required + src1_cp = utf8lwrcodepoint(src1_cp); + src2_cp = utf8lwrcodepoint(src2_cp); + + // check if the lowered codepoints match + if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) { + return 0; + } else if (src1_cp == src2_cp) { + continue; + } + + // if they don't match, then we return the difference between the characters + return src1_cp - src2_cp; + } +} + +void *utf8cat(void *utf8_restrict dst, const void *utf8_restrict src) { + char *d = (char *) dst; + const char *s = (const char *) src; + + // find the null terminating byte in dst + while ('\0' != *d) { + d++; + } + + // overwriting the null terminating byte in dst, append src byte-by-byte + while ('\0' != *s) { + *d++ = *s++; + } + + // write out a new null terminating byte into dst + *d = '\0'; + + return dst; +} + +void *utf8chr(const void *src, utf8_int32_t chr) { + char c[5] = {'\0', '\0', '\0', '\0', '\0'}; + + if (0 == chr) { + // being asked to return position of null terminating byte, so + // just run s to the end, and return! 
+ const char *s = (const char *) src; + while ('\0' != *s) { + s++; + } + return (void *) s; + } else if (0 == ((utf8_int32_t) 0xffffff80 & chr)) { + // 1-byte/7-bit ascii + // (0b0xxxxxxx) + c[0] = (char) chr; + } else if (0 == ((utf8_int32_t) 0xfffff800 & chr)) { + // 2-byte/11-bit utf8 code point + // (0b110xxxxx 0b10xxxxxx) + c[0] = 0xc0 | (char) (chr >> 6); + c[1] = 0x80 | (char) (chr & 0x3f); + } else if (0 == ((utf8_int32_t) 0xffff0000 & chr)) { + // 3-byte/16-bit utf8 code point + // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) + c[0] = 0xe0 | (char) (chr >> 12); + c[1] = 0x80 | (char) ((chr >> 6) & 0x3f); + c[2] = 0x80 | (char) (chr & 0x3f); + } else { // if (0 == ((int)0xffe00000 & chr)) { + // 4-byte/21-bit utf8 code point + // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) + c[0] = 0xf0 | (char) (chr >> 18); + c[1] = 0x80 | (char) ((chr >> 12) & 0x3f); + c[2] = 0x80 | (char) ((chr >> 6) & 0x3f); + c[3] = 0x80 | (char) (chr & 0x3f); + } + + // we've made c into a 2 utf8 codepoint string, one for the chr we are + // seeking, another for the null terminating byte. 
Now use utf8str to + // search + return utf8str(src, c); +} + +int utf8cmp(const void *src1, const void *src2) { + const unsigned char *s1 = (const unsigned char *) src1; + const unsigned char *s2 = (const unsigned char *) src2; + + while (('\0' != *s1) || ('\0' != *s2)) { + if (*s1 < *s2) { + return -1; + } else if (*s1 > *s2) { + return 1; + } + + s1++; + s2++; + } + + // both utf8 strings matched + return 0; +} + +int utf8coll(const void *src1, const void *src2); + +void *utf8cpy(void *utf8_restrict dst, const void *utf8_restrict src) { + char *d = (char *) dst; + const char *s = (const char *) src; + + // overwriting anything previously in dst, write byte-by-byte + // from src + while ('\0' != *s) { + *d++ = *s++; + } + + // append null terminating byte + *d = '\0'; + + return dst; +} + +size_t utf8cspn(const void *src, const void *reject) { + const char *s = (const char *) src; + size_t chars = 0; + + while ('\0' != *s) { + const char *r = (const char *) reject; + size_t offset = 0; + + while ('\0' != *r) { + // checking that if *r is the start of a utf8 codepoint + // (it is not 0b10xxxxxx) and we have successfully matched + // a previous character (0 < offset) - we found a match + if ((0x80 != (0xc0 & *r)) && (0 < offset)) { + return chars; + } else { + if (*r == s[offset]) { + // part of a utf8 codepoint matched, so move our checking + // onwards to the next byte + offset++; + r++; + } else { + // r could be in the middle of an unmatching utf8 code point, + // so we need to march it on to the next character beginning, + + do { + r++; + } while (0x80 == (0xc0 & *r)); + + // reset offset too as we found a mismatch + offset = 0; + } + } + } + + // found a match at the end of *r, so didn't get a chance to test it + if (0 < offset) { + return chars; + } + + // the current utf8 codepoint in src did not match reject, but src + // could have been partway through a utf8 codepoint, so we need to + // march it onto the next utf8 codepoint starting byte + do { + s++; + 
} while ((0x80 == (0xc0 & *s))); + chars++; + } + + return chars; +} + +size_t utf8size(const void *str); + +void *utf8dup(const void *src) { + const char *s = (const char *) src; + char *n = utf8_null; + + // figure out how many bytes (including the terminator) we need to copy first + size_t bytes = utf8size(src); + + n = (char *) malloc(bytes); + + if (utf8_null == n) { + // out of memory so we bail + return utf8_null; + } else { + bytes = 0; + + // copy src byte-by-byte into our new utf8 string + while ('\0' != s[bytes]) { + n[bytes] = s[bytes]; + bytes++; + } + + // append null terminating byte + n[bytes] = '\0'; + return n; + } +} + +void *utf8fry(const void *str); + +size_t utf8len(const void *str) { + const unsigned char *s = (const unsigned char *) str; + size_t length = 0; + + while ('\0' != *s) { + if (0xf0 == (0xf8 & *s)) { + // 4-byte utf8 code point (began with 0b11110xxx) + s += 4; + } else if (0xe0 == (0xf0 & *s)) { + // 3-byte utf8 code point (began with 0b1110xxxx) + s += 3; + } else if (0xc0 == (0xe0 & *s)) { + // 2-byte utf8 code point (began with 0b110xxxxx) + s += 2; + } else { // if (0x00 == (0x80 & *s)) { + // 1-byte ascii (began with 0b0xxxxxxx) + s += 1; + } + + // no matter the bytes we marched s forward by, it was + // only 1 utf8 codepoint + length++; + } + + return length; +} + +int utf8ncasecmp(const void *src1, const void *src2, size_t n) { + utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp; + + do { + const unsigned char *const s1 = (const unsigned char *) src1; + const unsigned char *const s2 = (const unsigned char *) src2; + + // first check that we have enough bytes left in n to contain an entire + // codepoint + if (0 == n) { + return 0; + } + + if ((1 == n) && ((0xc0 == (0xe0 & *s1)) || (0xc0 == (0xe0 & *s2)))) { + const utf8_int32_t c1 = (0xe0 & *s1); + const utf8_int32_t c2 = (0xe0 & *s2); + + if (c1 < c2) { + return -1; + } else if (c1 > c2) { + return 1; + } else { + return 0; + } + } + + if ((2 >= n) && ((0xe0 == 
(0xf0 & *s1)) || (0xe0 == (0xf0 & *s2)))) { + const utf8_int32_t c1 = (0xf0 & *s1); + const utf8_int32_t c2 = (0xf0 & *s2); + + if (c1 < c2) { + return -1; + } else if (c1 > c2) { + return 1; + } else { + return 0; + } + } + + if ((3 >= n) && ((0xf0 == (0xf8 & *s1)) || (0xf0 == (0xf8 & *s2)))) { + const utf8_int32_t c1 = (0xf8 & *s1); + const utf8_int32_t c2 = (0xf8 & *s2); + + if (c1 < c2) { + return -1; + } else if (c1 > c2) { + return 1; + } else { + return 0; + } + } + + src1 = utf8codepoint(src1, &src1_cp); + src2 = utf8codepoint(src2, &src2_cp); + n -= utf8codepointsize(src1_cp); + + // Take a copy of src1 & src2 + src1_orig_cp = src1_cp; + src2_orig_cp = src2_cp; + + // Lower srcs if required + src1_cp = utf8lwrcodepoint(src1_cp); + src2_cp = utf8lwrcodepoint(src2_cp); + + // Check if the lowered codepoints match + if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) { + return 0; + } else if (src1_cp == src2_cp) { + continue; + } + + // If they don't match, then we return which of the original's are less + if (src1_orig_cp < src2_orig_cp) { + return -1; + } else if (src1_orig_cp > src2_orig_cp) { + return 1; + } + } while (0 < n); + + // both utf8 strings matched + return 0; +} + +void *utf8ncat(void *utf8_restrict dst, const void *utf8_restrict src, + size_t n) { + char *d = (char *) dst; + const char *s = (const char *) src; + + // find the null terminating byte in dst + while ('\0' != *d) { + d++; + } + + // overwriting the null terminating byte in dst, append src byte-by-byte + // stopping if we run out of space + do { + *d++ = *s++; + } while (('\0' != *s) && (0 != --n)); + + // write out a new null terminating byte into dst + *d = '\0'; + + return dst; +} + +int utf8ncmp(const void *src1, const void *src2, size_t n) { + const unsigned char *s1 = (const unsigned char *) src1; + const unsigned char *s2 = (const unsigned char *) src2; + + while ((0 != n--) && (('\0' != *s1) || ('\0' != *s2))) { + if (*s1 < *s2) { + return -1; + } else if (*s1 > *s2) { + 
return 1; + } + + s1++; + s2++; + } + + // both utf8 strings matched + return 0; +} + +void *utf8ncpy(void *utf8_restrict dst, const void *utf8_restrict src, + size_t n) { + char *d = (char *) dst; + const char *s = (const char *) src; + size_t index; + + // overwriting anything previously in dst, write byte-by-byte + // from src + for (index = 0; index < n; index++) { + d[index] = s[index]; + if ('\0' == s[index]) { + break; + } + } + + // append null terminating byte + for (; index < n; index++) { + d[index] = 0; + } + + return dst; +} + +void *utf8ndup(const void *src, size_t n) { + const char *s = (const char *) src; + char *c = utf8_null; + size_t bytes = 0; + + // Find the end of the string or stop when n is reached + while ('\0' != s[bytes] && bytes < n) { + bytes++; + } + + // In case bytes is actually less than n, we need to set it + // to be used later in the copy byte by byte. + n = bytes; + + c = (char *) malloc(bytes + 1); + if (utf8_null == c) { + // out of memory so we bail + return utf8_null; + } + + bytes = 0; + + // copy src byte-by-byte into our new utf8 string + while ('\0' != s[bytes] && bytes < n) { + c[bytes] = s[bytes]; + bytes++; + } + + // append null terminating byte + c[bytes] = '\0'; + return c; +} + +void *utf8rchr(const void *src, int chr) { + const char *s = (const char *) src; + const char *match = utf8_null; + char c[5] = {'\0', '\0', '\0', '\0', '\0'}; + + if (0 == chr) { + // being asked to return position of null terminating byte, so + // just run s to the end, and return! 
+ while ('\0' != *s) { + s++; + } + return (void *) s; + } else if (0 == ((int) 0xffffff80 & chr)) { + // 1-byte/7-bit ascii + // (0b0xxxxxxx) + c[0] = (char) chr; + } else if (0 == ((int) 0xfffff800 & chr)) { + // 2-byte/11-bit utf8 code point + // (0b110xxxxx 0b10xxxxxx) + c[0] = 0xc0 | (char) (chr >> 6); + c[1] = 0x80 | (char) (chr & 0x3f); + } else if (0 == ((int) 0xffff0000 & chr)) { + // 3-byte/16-bit utf8 code point + // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) + c[0] = 0xe0 | (char) (chr >> 12); + c[1] = 0x80 | (char) ((chr >> 6) & 0x3f); + c[2] = 0x80 | (char) (chr & 0x3f); + } else { // if (0 == ((int)0xffe00000 & chr)) { + // 4-byte/21-bit utf8 code point + // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) + c[0] = 0xf0 | (char) (chr >> 18); + c[1] = 0x80 | (char) ((chr >> 12) & 0x3f); + c[2] = 0x80 | (char) ((chr >> 6) & 0x3f); + c[3] = 0x80 | (char) (chr & 0x3f); + } + + // we've created a 2 utf8 codepoint string in c that is + // the utf8 character asked for by chr, and a null + // terminating byte + + while ('\0' != *s) { + size_t offset = 0; + + while (s[offset] == c[offset]) { + offset++; + } + + if ('\0' == c[offset]) { + // we found a matching utf8 code point + match = s; + s += offset; + } else { + s += offset; + + // need to march s along to next utf8 codepoint start + // (the next byte that doesn't match 0b10xxxxxx) + if ('\0' != *s) { + do { + s++; + } while (0x80 == (0xc0 & *s)); + } + } + } + + // return the last match we found (or 0 if no match was found) + return (void *) match; +} + +void *utf8pbrk(const void *str, const void *accept) { + const char *s = (const char *) str; + + while ('\0' != *s) { + const char *a = (const char *) accept; + size_t offset = 0; + + while ('\0' != *a) { + // checking that if *a is the start of a utf8 codepoint + // (it is not 0b10xxxxxx) and we have successfully matched + // a previous character (0 < offset) - we found a match + if ((0x80 != (0xc0 & *a)) && (0 < offset)) { + return (void *) s; + } else { + if (*a 
// Counts how many utf8 codepoints at the start of src are members of the
// set of codepoints in accept. Counting stops at the first codepoint of src
// that is not in accept, or at src's null terminating byte.
size_t utf8spn(const void *src, const void *accept) {
  const char *cursor = (const char *) src;
  size_t matched = 0;

  while ('\0' != *cursor) {
    const char *candidate = (const char *) accept;
    // number of bytes of the current candidate codepoint that have
    // matched the bytes at cursor so far
    size_t run = 0;

    for (; '\0' != *candidate;) {
      // candidate sits on the first byte of a new codepoint while a run
      // is open: the previous candidate codepoint matched completely
      if ((0 < run) && (0x80 != (0xc0 & *candidate))) {
        break;
      }

      if (cursor[run] == *candidate) {
        // one more byte of this candidate agrees, keep comparing
        run++;
        candidate++;
        continue;
      }

      // mismatch: discard this candidate by skipping its first byte and
      // any 0b10xxxxxx continuation bytes that follow it
      candidate++;
      while (0x80 == (0xc0 & *candidate)) {
        candidate++;
      }
      run = 0;
    }

    // an empty run means every codepoint in accept was tried and none
    // matched, so the span ends here
    if (0 == run) {
      return matched;
    }

    // either we broke out on a match, or the last codepoint of accept
    // matched right at its null terminating byte
    matched++;
    cursor += run;
  }

  return matched;
}
+ if (h_cp != n_cp) { + break; + } + + h = utf8codepoint(h, &h_cp); + n = utf8codepoint(n, &n_cp); + } + + if (0 == n_cp) { + // we found the whole utf8 string for needle in haystack at + // maybeMatch, so return it + return (void *) maybeMatch; + } + + if (0 == h_cp) { + // no match + return utf8_null; + } + + // Roll back to the next code point in the haystack to test + h = nextH; + } +} + +void *utf8valid(const void *str) { + const char *s = (const char *) str; + + while ('\0' != *s) { + if (0xf0 == (0xf8 & *s)) { + // ensure each of the 3 following bytes in this 4-byte + // utf8 codepoint began with 0b10xxxxxx + if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) || + (0x80 != (0xc0 & s[3]))) { + return (void *) s; + } + + // ensure that our utf8 codepoint ended after 4 bytes + if (0x80 == (0xc0 & s[4])) { + return (void *) s; + } + + // ensure that the top 5 bits of this 4-byte utf8 + // codepoint were not 0, as then we could have used + // one of the smaller encodings + if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) { + return (void *) s; + } + + // 4-byte utf8 code point (began with 0b11110xxx) + s += 4; + } else if (0xe0 == (0xf0 & *s)) { + // ensure each of the 2 following bytes in this 3-byte + // utf8 codepoint began with 0b10xxxxxx + if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) { + return (void *) s; + } + + // ensure that our utf8 codepoint ended after 3 bytes + if (0x80 == (0xc0 & s[3])) { + return (void *) s; + } + + // ensure that the top 5 bits of this 3-byte utf8 + // codepoint were not 0, as then we could have used + // one of the smaller encodings + if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) { + return (void *) s; + } + + // 3-byte utf8 code point (began with 0b1110xxxx) + s += 3; + } else if (0xc0 == (0xe0 & *s)) { + // ensure the 1 following byte in this 2-byte + // utf8 codepoint began with 0b10xxxxxx + if (0x80 != (0xc0 & s[1])) { + return (void *) s; + } + + // ensure that our utf8 codepoint ended after 2 bytes + if 
(0x80 == (0xc0 & s[2])) { + return (void *) s; + } + + // ensure that the top 4 bits of this 2-byte utf8 + // codepoint were not 0, as then we could have used + // one of the smaller encodings + if (0 == (0x1e & s[0])) { + return (void *) s; + } + + // 2-byte utf8 code point (began with 0b110xxxxx) + s += 2; + } else if (0x00 == (0x80 & *s)) { + // 1-byte ascii (began with 0b0xxxxxxx) + s += 1; + } else { + // we have an invalid 0b1xxxxxxx utf8 code point entry + return (void *) s; + } + } + + return utf8_null; +} + +void *utf8codepoint(const void *utf8_restrict str, + utf8_int32_t *utf8_restrict out_codepoint) { + const char *s = (const char *) str; + + if (0xf0 == (0xf8 & s[0])) { + // 4 byte utf8 codepoint + *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | + ((0x3f & s[2]) << 6) | (0x3f & s[3]); + s += 4; + } else if (0xe0 == (0xf0 & s[0])) { + // 3 byte utf8 codepoint + *out_codepoint = + ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]); + s += 3; + } else if (0xc0 == (0xe0 & s[0])) { + // 2 byte utf8 codepoint + *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]); + s += 2; + } else { + // 1 byte utf8 codepoint otherwise + *out_codepoint = s[0]; + s += 1; + } + + return (void *) s; +} + +size_t utf8codepointcalcsize(const void *utf8_restrict str) { + const char *s = (const char *) str; + + if (0xf0 == (0xf8 & s[0])) { + // 4 byte utf8 codepoint + return 4; + } else if (0xe0 == (0xf0 & s[0])) { + // 3 byte utf8 codepoint + return 3; + } else if (0xc0 == (0xe0 & s[0])) { + // 2 byte utf8 codepoint + return 2; + } + + // 1 byte utf8 codepoint otherwise + return 1; +} + +size_t utf8codepointsize(utf8_int32_t chr) { + if (0 == ((utf8_int32_t) 0xffffff80 & chr)) { + return 1; + } else if (0 == ((utf8_int32_t) 0xfffff800 & chr)) { + return 2; + } else if (0 == ((utf8_int32_t) 0xffff0000 & chr)) { + return 3; + } else { // if (0 == ((int)0xffe00000 & chr)) { + return 4; + } +} + +void *utf8catcodepoint(void *utf8_restrict str, 
utf8_int32_t chr, size_t n) { + char *s = (char *) str; + + if (0 == ((utf8_int32_t) 0xffffff80 & chr)) { + // 1-byte/7-bit ascii + // (0b0xxxxxxx) + if (n < 1) { + return utf8_null; + } + s[0] = (char) chr; + s += 1; + } else if (0 == ((utf8_int32_t) 0xfffff800 & chr)) { + // 2-byte/11-bit utf8 code point + // (0b110xxxxx 0b10xxxxxx) + if (n < 2) { + return utf8_null; + } + s[0] = 0xc0 | (char) (chr >> 6); + s[1] = 0x80 | (char) (chr & 0x3f); + s += 2; + } else if (0 == ((utf8_int32_t) 0xffff0000 & chr)) { + // 3-byte/16-bit utf8 code point + // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) + if (n < 3) { + return utf8_null; + } + s[0] = 0xe0 | (char) (chr >> 12); + s[1] = 0x80 | (char) ((chr >> 6) & 0x3f); + s[2] = 0x80 | (char) (chr & 0x3f); + s += 3; + } else { // if (0 == ((int)0xffe00000 & chr)) { + // 4-byte/21-bit utf8 code point + // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) + if (n < 4) { + return utf8_null; + } + s[0] = 0xf0 | (char) (chr >> 18); + s[1] = 0x80 | (char) ((chr >> 12) & 0x3f); + s[2] = 0x80 | (char) ((chr >> 6) & 0x3f); + s[3] = 0x80 | (char) (chr & 0x3f); + s += 4; + } + + return s; +} + +int utf8islower(utf8_int32_t chr) { return chr != utf8uprcodepoint(chr); } + +int utf8isupper(utf8_int32_t chr) { return chr != utf8lwrcodepoint(chr); } + +void utf8lwr(void *utf8_restrict str) { + void *p, *pn; + utf8_int32_t cp; + + p = (char *) str; + pn = utf8codepoint(p, &cp); + + while (cp != 0) { + const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp); + const size_t size = utf8codepointsize(lwr_cp); + + if (lwr_cp != cp) { + utf8catcodepoint(p, lwr_cp, size); + } + + p = pn; + pn = utf8codepoint(p, &cp); + } +} + +void utf8upr(void *utf8_restrict str) { + void *p, *pn; + utf8_int32_t cp; + + p = (char *) str; + pn = utf8codepoint(p, &cp); + + while (cp != 0) { + const utf8_int32_t lwr_cp = utf8uprcodepoint(cp); + const size_t size = utf8codepointsize(lwr_cp); + + if (lwr_cp != cp) { + utf8catcodepoint(p, lwr_cp, size); + } + + p = pn; + pn = utf8codepoint(p, 
&cp); + } +} + +utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) { + if (((0x0041 <= cp) && (0x005a >= cp)) || + ((0x00c0 <= cp) && (0x00d6 >= cp)) || + ((0x00d8 <= cp) && (0x00de >= cp)) || + ((0x0391 <= cp) && (0x03a1 >= cp)) || + ((0x03a3 <= cp) && (0x03ab >= cp))) { + cp += 32; + } else if (((0x0100 <= cp) && (0x012f >= cp)) || + ((0x0132 <= cp) && (0x0137 >= cp)) || + ((0x014a <= cp) && (0x0177 >= cp)) || + ((0x0182 <= cp) && (0x0185 >= cp)) || + ((0x01a0 <= cp) && (0x01a5 >= cp)) || + ((0x01de <= cp) && (0x01ef >= cp)) || + ((0x01f8 <= cp) && (0x021f >= cp)) || + ((0x0222 <= cp) && (0x0233 >= cp)) || + ((0x0246 <= cp) && (0x024f >= cp)) || + ((0x03d8 <= cp) && (0x03ef >= cp))) { + cp |= 0x1; + } else if (((0x0139 <= cp) && (0x0148 >= cp)) || + ((0x0179 <= cp) && (0x017e >= cp)) || + ((0x01af <= cp) && (0x01b0 >= cp)) || + ((0x01b3 <= cp) && (0x01b6 >= cp)) || + ((0x01cd <= cp) && (0x01dc >= cp))) { + cp += 1; + cp &= ~0x1; + } else if ((cp >= 0x400 && cp <= 0x40f)) { + cp += 0x50; + } else if ((cp >= 0x410 && cp <= 0x41f) || + (cp >= 0x420 && cp <= 0x42f)) { + cp += 0x20; + } else if (((cp >= 0x460) && (cp <= 0x4ff))) { + cp += 1; + } else { + switch (cp) { + default: + break; + case 0x0178: + cp = 0x00ff; + break; + case 0x0243: + cp = 0x0180; + break; + case 0x018e: + cp = 0x01dd; + break; + case 0x023d: + cp = 0x019a; + break; + case 0x0220: + cp = 0x019e; + break; + case 0x01b7: + cp = 0x0292; + break; + case 0x01c4: + cp = 0x01c6; + break; + case 0x01c7: + cp = 0x01c9; + break; + case 0x01ca: + cp = 0x01cc; + break; + case 0x01f1: + cp = 0x01f3; + break; + case 0x01f7: + cp = 0x01bf; + break; + case 0x0187: + cp = 0x0188; + break; + case 0x018b: + cp = 0x018c; + break; + case 0x0191: + cp = 0x0192; + break; + case 0x0198: + cp = 0x0199; + break; + case 0x01a7: + cp = 0x01a8; + break; + case 0x01ac: + cp = 0x01ad; + break; + case 0x01af: + cp = 0x01b0; + break; + case 0x01b8: + cp = 0x01b9; + break; + case 0x01bc: + cp = 0x01bd; + break; + case 0x01f4: + cp 
= 0x01f5; + break; + case 0x023b: + cp = 0x023c; + break; + case 0x0241: + cp = 0x0242; + break; + case 0x03fd: + cp = 0x037b; + break; + case 0x03fe: + cp = 0x037c; + break; + case 0x03ff: + cp = 0x037d; + break; + case 0x037f: + cp = 0x03f3; + break; + case 0x0386: + cp = 0x03ac; + break; + case 0x0388: + cp = 0x03ad; + break; + case 0x0389: + cp = 0x03ae; + break; + case 0x038a: + cp = 0x03af; + break; + case 0x038c: + cp = 0x03cc; + break; + case 0x038e: + cp = 0x03cd; + break; + case 0x038f: + cp = 0x03ce; + break; + case 0x0370: + cp = 0x0371; + break; + case 0x0372: + cp = 0x0373; + break; + case 0x0376: + cp = 0x0377; + break; + case 0x03f4: + cp = 0x03d1; + break; + case 0x03cf: + cp = 0x03d7; + break; + case 0x03f9: + cp = 0x03f2; + break; + case 0x03f7: + cp = 0x03f8; + break; + case 0x03fa: + cp = 0x03fb; + break; + }; + } + + return cp; +} + +utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) { + if (((0x0061 <= cp) && (0x007a >= cp)) || + ((0x00e0 <= cp) && (0x00f6 >= cp)) || + ((0x00f8 <= cp) && (0x00fe >= cp)) || + ((0x03b1 <= cp) && (0x03c1 >= cp)) || + ((0x03c3 <= cp) && (0x03cb >= cp))) { + cp -= 32; + } else if (((0x0100 <= cp) && (0x012f >= cp)) || + ((0x0132 <= cp) && (0x0137 >= cp)) || + ((0x014a <= cp) && (0x0177 >= cp)) || + ((0x0182 <= cp) && (0x0185 >= cp)) || + ((0x01a0 <= cp) && (0x01a5 >= cp)) || + ((0x01de <= cp) && (0x01ef >= cp)) || + ((0x01f8 <= cp) && (0x021f >= cp)) || + ((0x0222 <= cp) && (0x0233 >= cp)) || + ((0x0246 <= cp) && (0x024f >= cp)) || + ((0x03d8 <= cp) && (0x03ef >= cp))) { + cp &= ~0x1; + } else if (((0x0139 <= cp) && (0x0148 >= cp)) || + ((0x0179 <= cp) && (0x017e >= cp)) || + ((0x01af <= cp) && (0x01b0 >= cp)) || + ((0x01b3 <= cp) && (0x01b6 >= cp)) || + ((0x01cd <= cp) && (0x01dc >= cp))) { + cp -= 1; + cp |= 0x1; + } else { + switch (cp) { + default: + break; + case 0x00ff: + cp = 0x0178; + break; + case 0x0180: + cp = 0x0243; + break; + case 0x01dd: + cp = 0x018e; + break; + case 0x019a: + cp = 0x023d; + break; + 
case 0x019e: + cp = 0x0220; + break; + case 0x0292: + cp = 0x01b7; + break; + case 0x01c6: + cp = 0x01c4; + break; + case 0x01c9: + cp = 0x01c7; + break; + case 0x01cc: + cp = 0x01ca; + break; + case 0x01f3: + cp = 0x01f1; + break; + case 0x01bf: + cp = 0x01f7; + break; + case 0x0188: + cp = 0x0187; + break; + case 0x018c: + cp = 0x018b; + break; + case 0x0192: + cp = 0x0191; + break; + case 0x0199: + cp = 0x0198; + break; + case 0x01a8: + cp = 0x01a7; + break; + case 0x01ad: + cp = 0x01ac; + break; + case 0x01b0: + cp = 0x01af; + break; + case 0x01b9: + cp = 0x01b8; + break; + case 0x01bd: + cp = 0x01bc; + break; + case 0x01f5: + cp = 0x01f4; + break; + case 0x023c: + cp = 0x023b; + break; + case 0x0242: + cp = 0x0241; + break; + case 0x037b: + cp = 0x03fd; + break; + case 0x037c: + cp = 0x03fe; + break; + case 0x037d: + cp = 0x03ff; + break; + case 0x03f3: + cp = 0x037f; + break; + case 0x03ac: + cp = 0x0386; + break; + case 0x03ad: + cp = 0x0388; + break; + case 0x03ae: + cp = 0x0389; + break; + case 0x03af: + cp = 0x038a; + break; + case 0x03cc: + cp = 0x038c; + break; + case 0x03cd: + cp = 0x038e; + break; + case 0x03ce: + cp = 0x038f; + break; + case 0x0371: + cp = 0x0370; + break; + case 0x0373: + cp = 0x0372; + break; + case 0x0377: + cp = 0x0376; + break; + case 0x03d1: + cp = 0x03f4; + break; + case 0x03d7: + cp = 0x03cf; + break; + case 0x03f2: + cp = 0x03f9; + break; + case 0x03f8: + cp = 0x03f7; + break; + case 0x03fb: + cp = 0x03fa; + break; + }; + } + + return cp; +} + +#undef utf8_restrict +#undef utf8_null + +#ifdef __cplusplus +} // extern "C" +#endif + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#endif // SHEREDOM_UTF8_H_INCLUDED