| /*************************************************** |
| * A program for testing the Unicode property table * |
| ***************************************************/ |
| |
| /* Copyright (c) University of Cambridge 2008 - 2018 */ |
| |
| /* Compile thus: |
| gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \ |
| ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c |
| */ |
| |
/* The program reads commands from stdin and writes output to stdout. Each
input line is echoed before it is processed. There is only one command,
"findprop", which is followed by a list of Unicode code points given as hex
numbers (without any prefixes). The output is one line per character, giving
its Unicode properties followed by its other case, if there is one, and any
further characters in its caseless set. Any other command is reported as
unknown. */
| |
| #ifdef HAVE_CONFIG_H |
| #include "../src/config.h" |
| #endif |
| |
| #ifndef SUPPORT_UNICODE |
| #define SUPPORT_UNICODE |
| #endif |
| |
| #include <ctype.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include "../src/pcre2_internal.h" |
| #include "../src/pcre2_ucp.h" |
| |
| |
| |
| /* -------------------------------------------------------------------*/ |
| |
/* Cast-prefix shorthands. Each macro expands to a parenthesized pointer type,
so writing it directly in front of an expression (for example US"text") casts
that expression to the named type. */

#define CS   (char *)                  /* cast to char * */
#define CCS  (const char *)            /* cast to const char * */
#define CSS  (char **)                 /* cast to char ** */
#define US   (unsigned char *)         /* cast to unsigned char * */
#define CUS  (const unsigned char *)   /* cast to const unsigned char * */
#define USS  (unsigned char **)        /* cast to unsigned char ** */
| |
| /* -------------------------------------------------------------------*/ |
| |
| |
| |
| |
| /************************************************* |
| * Print Unicode property info for a char * |
| *************************************************/ |
| |
| static void |
| print_prop(int c) |
| { |
| int type = UCD_CATEGORY(c); |
| int fulltype = UCD_CHARTYPE(c); |
| int script = UCD_SCRIPT(c); |
| int gbprop = UCD_GRAPHBREAK(c); |
| int othercase = UCD_OTHERCASE(c); |
| int caseset = UCD_CASESET(c); |
| |
| unsigned char *fulltypename = US"??"; |
| unsigned char *typename = US"??"; |
| unsigned char *scriptname = US"??"; |
| unsigned char *graphbreak = US"??"; |
| |
| switch (type) |
| { |
| case ucp_C: typename = US"Control"; break; |
| case ucp_L: typename = US"Letter"; break; |
| case ucp_M: typename = US"Mark"; break; |
| case ucp_N: typename = US"Number"; break; |
| case ucp_P: typename = US"Punctuation"; break; |
| case ucp_S: typename = US"Symbol"; break; |
| case ucp_Z: typename = US"Separator"; break; |
| } |
| |
| switch (fulltype) |
| { |
| case ucp_Cc: fulltypename = US"Control"; break; |
| case ucp_Cf: fulltypename = US"Format"; break; |
| case ucp_Cn: fulltypename = US"Unassigned"; break; |
| case ucp_Co: fulltypename = US"Private use"; break; |
| case ucp_Cs: fulltypename = US"Surrogate"; break; |
| case ucp_Ll: fulltypename = US"Lower case letter"; break; |
| case ucp_Lm: fulltypename = US"Modifier letter"; break; |
| case ucp_Lo: fulltypename = US"Other letter"; break; |
| case ucp_Lt: fulltypename = US"Title case letter"; break; |
| case ucp_Lu: fulltypename = US"Upper case letter"; break; |
| case ucp_Mc: fulltypename = US"Spacing mark"; break; |
| case ucp_Me: fulltypename = US"Enclosing mark"; break; |
| case ucp_Mn: fulltypename = US"Non-spacing mark"; break; |
| case ucp_Nd: fulltypename = US"Decimal number"; break; |
| case ucp_Nl: fulltypename = US"Letter number"; break; |
| case ucp_No: fulltypename = US"Other number"; break; |
| case ucp_Pc: fulltypename = US"Connector punctuation"; break; |
| case ucp_Pd: fulltypename = US"Dash punctuation"; break; |
| case ucp_Pe: fulltypename = US"Close punctuation"; break; |
| case ucp_Pf: fulltypename = US"Final punctuation"; break; |
| case ucp_Pi: fulltypename = US"Initial punctuation"; break; |
| case ucp_Po: fulltypename = US"Other punctuation"; break; |
| case ucp_Ps: fulltypename = US"Open punctuation"; break; |
| case ucp_Sc: fulltypename = US"Currency symbol"; break; |
| case ucp_Sk: fulltypename = US"Modifier symbol"; break; |
| case ucp_Sm: fulltypename = US"Mathematical symbol"; break; |
| case ucp_So: fulltypename = US"Other symbol"; break; |
| case ucp_Zl: fulltypename = US"Line separator"; break; |
| case ucp_Zp: fulltypename = US"Paragraph separator"; break; |
| case ucp_Zs: fulltypename = US"Space separator"; break; |
| } |
| |
| switch(gbprop) |
| { |
| case ucp_gbCR: graphbreak = US"CR"; break; |
| case ucp_gbLF: graphbreak = US"LF"; break; |
| case ucp_gbControl: graphbreak = US"Control"; break; |
| case ucp_gbExtend: graphbreak = US"Extend"; break; |
| case ucp_gbPrepend: graphbreak = US"Prepend"; break; |
| case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break; |
| case ucp_gbL: graphbreak = US"Hangul syllable type L"; break; |
| case ucp_gbV: graphbreak = US"Hangul syllable type V"; break; |
| case ucp_gbT: graphbreak = US"Hangul syllable type T"; break; |
| case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break; |
| case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break; |
| case ucp_gbRegionalIndicator: |
| graphbreak = US"Regional Indicator"; break; |
| case ucp_gbOther: graphbreak = US"Other"; break; |
| case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break; |
| case ucp_gbExtended_Pictographic: |
| graphbreak = US"Extended Pictographic"; break; |
| default: graphbreak = US"Unknown"; break; |
| } |
| |
| switch(script) |
| { |
| case ucp_Unknown: scriptname = US"Unknown"; break; |
| case ucp_Arabic: scriptname = US"Arabic"; break; |
| case ucp_Armenian: scriptname = US"Armenian"; break; |
| case ucp_Balinese: scriptname = US"Balinese"; break; |
| case ucp_Bengali: scriptname = US"Bengali"; break; |
| case ucp_Bopomofo: scriptname = US"Bopomofo"; break; |
| case ucp_Braille: scriptname = US"Braille"; break; |
| case ucp_Buginese: scriptname = US"Buginese"; break; |
| case ucp_Buhid: scriptname = US"Buhid"; break; |
| case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break; |
| case ucp_Cherokee: scriptname = US"Cherokee"; break; |
| case ucp_Common: scriptname = US"Common"; break; |
| case ucp_Coptic: scriptname = US"Coptic"; break; |
| case ucp_Cuneiform: scriptname = US"Cuneiform"; break; |
| case ucp_Cypriot: scriptname = US"Cypriot"; break; |
| case ucp_Cyrillic: scriptname = US"Cyrillic"; break; |
| case ucp_Deseret: scriptname = US"Deseret"; break; |
| case ucp_Devanagari: scriptname = US"Devanagari"; break; |
| case ucp_Ethiopic: scriptname = US"Ethiopic"; break; |
| case ucp_Georgian: scriptname = US"Georgian"; break; |
| case ucp_Glagolitic: scriptname = US"Glagolitic"; break; |
| case ucp_Gothic: scriptname = US"Gothic"; break; |
| case ucp_Greek: scriptname = US"Greek"; break; |
| case ucp_Gujarati: scriptname = US"Gujarati"; break; |
| case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break; |
| case ucp_Han: scriptname = US"Han"; break; |
| case ucp_Hangul: scriptname = US"Hangul"; break; |
| case ucp_Hanunoo: scriptname = US"Hanunoo"; break; |
| case ucp_Hebrew: scriptname = US"Hebrew"; break; |
| case ucp_Hiragana: scriptname = US"Hiragana"; break; |
| case ucp_Inherited: scriptname = US"Inherited"; break; |
| case ucp_Kannada: scriptname = US"Kannada"; break; |
| case ucp_Katakana: scriptname = US"Katakana"; break; |
| case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break; |
| case ucp_Khmer: scriptname = US"Khmer"; break; |
| case ucp_Lao: scriptname = US"Lao"; break; |
| case ucp_Latin: scriptname = US"Latin"; break; |
| case ucp_Limbu: scriptname = US"Limbu"; break; |
| case ucp_Linear_B: scriptname = US"Linear_B"; break; |
| case ucp_Malayalam: scriptname = US"Malayalam"; break; |
| case ucp_Mongolian: scriptname = US"Mongolian"; break; |
| case ucp_Myanmar: scriptname = US"Myanmar"; break; |
| case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break; |
| case ucp_Nko: scriptname = US"Nko"; break; |
| case ucp_Ogham: scriptname = US"Ogham"; break; |
| case ucp_Old_Italic: scriptname = US"Old_Italic"; break; |
| case ucp_Old_Persian: scriptname = US"Old_Persian"; break; |
| case ucp_Oriya: scriptname = US"Oriya"; break; |
| case ucp_Osmanya: scriptname = US"Osmanya"; break; |
| case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break; |
| case ucp_Phoenician: scriptname = US"Phoenician"; break; |
| case ucp_Runic: scriptname = US"Runic"; break; |
| case ucp_Shavian: scriptname = US"Shavian"; break; |
| case ucp_Sinhala: scriptname = US"Sinhala"; break; |
| case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break; |
| case ucp_Syriac: scriptname = US"Syriac"; break; |
| case ucp_Tagalog: scriptname = US"Tagalog"; break; |
| case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break; |
| case ucp_Tai_Le: scriptname = US"Tai_Le"; break; |
| case ucp_Tamil: scriptname = US"Tamil"; break; |
| case ucp_Telugu: scriptname = US"Telugu"; break; |
| case ucp_Thaana: scriptname = US"Thaana"; break; |
| case ucp_Thai: scriptname = US"Thai"; break; |
| case ucp_Tibetan: scriptname = US"Tibetan"; break; |
| case ucp_Tifinagh: scriptname = US"Tifinagh"; break; |
| case ucp_Ugaritic: scriptname = US"Ugaritic"; break; |
| case ucp_Yi: scriptname = US"Yi"; break; |
| /* New for Unicode 5.1: */ |
| case ucp_Carian: scriptname = US"Carian"; break; |
| case ucp_Cham: scriptname = US"Cham"; break; |
| case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break; |
| case ucp_Lepcha: scriptname = US"Lepcha"; break; |
| case ucp_Lycian: scriptname = US"Lycian"; break; |
| case ucp_Lydian: scriptname = US"Lydian"; break; |
| case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break; |
| case ucp_Rejang: scriptname = US"Rejang"; break; |
| case ucp_Saurashtra: scriptname = US"Saurashtra"; break; |
| case ucp_Sundanese: scriptname = US"Sundanese"; break; |
| case ucp_Vai: scriptname = US"Vai"; break; |
| /* New for Unicode 5.2: */ |
| case ucp_Avestan: scriptname = US"Avestan"; break; |
| case ucp_Bamum: scriptname = US"Bamum"; break; |
| case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break; |
| case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break; |
| case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break; |
| case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break; |
| case ucp_Javanese: scriptname = US"Javanese"; break; |
| case ucp_Kaithi: scriptname = US"Kaithi"; break; |
| case ucp_Lisu: scriptname = US"Lisu"; break; |
| case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break; |
| case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break; |
| case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break; |
| case ucp_Samaritan: scriptname = US"Samaritan"; break; |
| case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break; |
| case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break; |
| /* New for Unicode 6.0.0 */ |
| case ucp_Batak: scriptname = US"Batak"; break; |
| case ucp_Brahmi: scriptname = US"Brahmi"; break; |
| case ucp_Mandaic: scriptname = US"Mandaic"; break; |
| |
| /* New for Unicode 6.1.0 */ |
| case ucp_Chakma: scriptname = US"Chakma"; break; |
| case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break; |
| case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break; |
| case ucp_Miao: scriptname = US"Miao"; break; |
| case ucp_Sharada: scriptname = US"Sharada"; break; |
| case ucp_Sora_Sompeng: scriptname = US"Sora Sompent"; break; |
| case ucp_Takri: scriptname = US"Takri"; break; |
| |
| /* New for Unicode 7.0.0 */ |
| case ucp_Bassa_Vah: scriptname = US"Bassa_Vah"; break; |
| case ucp_Caucasian_Albanian: scriptname = US"Caucasian_Albanian"; break; |
| case ucp_Duployan: scriptname = US"Duployan"; break; |
| case ucp_Elbasan: scriptname = US"Elbasan"; break; |
| case ucp_Grantha: scriptname = US"Grantha"; break; |
| case ucp_Khojki: scriptname = US"Khojki"; break; |
| case ucp_Khudawadi: scriptname = US"Khudawadi"; break; |
| case ucp_Linear_A: scriptname = US"Linear_A"; break; |
| case ucp_Mahajani: scriptname = US"Mahajani"; break; |
| case ucp_Manichaean: scriptname = US"Manichaean"; break; |
| case ucp_Mende_Kikakui: scriptname = US"Mende_Kikakui"; break; |
| case ucp_Modi: scriptname = US"Modi"; break; |
| case ucp_Mro: scriptname = US"Mro"; break; |
| case ucp_Nabataean: scriptname = US"Nabataean"; break; |
| case ucp_Old_North_Arabian: scriptname = US"Old_North_Arabian"; break; |
| case ucp_Old_Permic: scriptname = US"Old_Permic"; break; |
| case ucp_Pahawh_Hmong: scriptname = US"Pahawh_Hmong"; break; |
| case ucp_Palmyrene: scriptname = US"Palmyrene"; break; |
| case ucp_Psalter_Pahlavi: scriptname = US"Psalter_Pahlavi"; break; |
| case ucp_Pau_Cin_Hau: scriptname = US"Pau_Cin_Hau"; break; |
| case ucp_Siddham: scriptname = US"Siddham"; break; |
| case ucp_Tirhuta: scriptname = US"Tirhuta"; break; |
| case ucp_Warang_Citi: scriptname = US"Warang_Citi"; break; |
| |
| /* New for Unicode 8.0.0 */ |
| case ucp_Ahom: scriptname = US"Ahom"; break; |
| case ucp_Anatolian_Hieroglyphs: scriptname = US"Anatolian_Hieroglyphs"; break; |
| case ucp_Hatran: scriptname = US"Hatran"; break; |
| case ucp_Multani: scriptname = US"Multani"; break; |
| case ucp_Old_Hungarian: scriptname = US"Old_Hungarian"; break; |
| case ucp_SignWriting: scriptname = US"SignWriting"; break; |
| |
| /* New for Unicode 10.0.0 (no update since 8.0.0) */ |
| case ucp_Adlam: scriptname = US"Adlam"; break; |
| case ucp_Bhaiksuki: scriptname = US"Bhaiksuki"; break; |
| case ucp_Marchen: scriptname = US"Marchen"; break; |
| case ucp_Newa: scriptname = US"Newa"; break; |
| case ucp_Osage: scriptname = US"Osage"; break; |
| case ucp_Tangut: scriptname = US"Tangut"; break; |
| case ucp_Masaram_Gondi: scriptname = US"Masaram_Gondi"; break; |
| case ucp_Nushu: scriptname = US"Nushu"; break; |
| case ucp_Soyombo: scriptname = US"Soyombo"; break; |
| case ucp_Zanabazar_Square: scriptname = US"Zanabazar_Square"; break; |
| |
| /* New for Unicode 11.0.0 */ |
| case ucp_Dogra: scriptname = US"Dogra"; break; |
| case ucp_Gunjala_Gondi: scriptname = US"Gunjala_Gondi"; break; |
| case ucp_Hanifi_Rohingya: scriptname = US"Hanifi_Rohingya"; break; |
| case ucp_Makasar: scriptname = US"Makasar"; break; |
| case ucp_Medefaidrin: scriptname = US"Medefaidrin"; break; |
| case ucp_Old_Sogdian: scriptname = US"Old_Sogdian"; break; |
| case ucp_Sogdian: scriptname = US"Sogdian"; break; |
| } |
| |
| printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak); |
| if (othercase != c) |
| { |
| printf(", %04x", othercase); |
| if (caseset != 0) |
| { |
| const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1; |
| while (*(++p) < NOTACHAR) |
| if (*p != othercase && *p != c) printf(", %04x", *p); |
| } |
| } |
| printf("\n"); |
| } |
| |
| |
| |
/*************************************************
*               Main program                     *
*************************************************/

/* Read command lines from stdin until end of file. Each line is echoed to
stdout, leading white space is skipped, and blank lines are ignored. The first
word is the command; the only one recognized is "findprop", whose remaining
words are parsed as hexadecimal code points and passed to print_prop() one at
a time. Any other command produces an "Unknown test command" line.

Input problems are reported instead of being acted on:
  . a command word too long for the name buffer is truncated when stored (the
    rest of the word is skipped), so it cannot overrun the buffer;
  . a word that does not start with a hex number stops processing of that
    line, because strtoul() consumes nothing and the scan could not advance;
  . a value above 0x10ffff is not passed to print_prop().

Returns:     0 always
*/

int
main(void)
{
const unsigned long max_code_point = 0x10fffful;
unsigned char buffer[1024];

while (fgets((char *)buffer, (int)sizeof(buffer), stdin) != NULL)
  {
  unsigned char name[24];
  unsigned char *s = buffer;
  size_t namelen = 0;

  printf("%s", buffer);
  while (isspace(*s)) s++;
  if (*s == 0) continue;

  /* Copy the command word, keeping one byte free for the terminating zero.
  Excess characters are skipped so that s still ends up after the word. */

  for (; *s != 0 && !isspace(*s); s++)
    {
    if (namelen < sizeof(name) - 1) name[namelen++] = *s;
    }
  name[namelen] = 0;
  while (isspace(*s)) s++;

  if (strcmp((char *)name, "findprop") != 0)
    {
    printf("Unknown test command %s\n", name);
    continue;
    }

  while (*s != 0)
    {
    char *endptr;
    unsigned long value = strtoul((char *)s, &endptr, 16);

    /* No characters consumed means the word is not a hex number. Give up on
    this line; otherwise the loop would never move past the word. */

    if ((unsigned char *)endptr == s)
      {
      size_t len = 0;
      while (s[len] != 0 && !isspace(s[len])) len++;
      printf("** Invalid hex number \"%.*s\"\n", (int)len, s);
      break;
      }

    /* This also catches strtoul() overflow (ULONG_MAX) and negated input. */

    if (value > max_code_point)
      printf("** Code point out of range: %lx\n", value);
    else
      print_prop((int)value);

    s = (unsigned char *)endptr;
    while (isspace(*s)) s++;
    }
  }

return 0;
}
| |
| /* End */ |