# Source: android/platform/external/icu4c.git, revision 1a77b9a60c9b34a0a0c54c8015515e5d1d4c9e3c, path test/testdata/collationtest.txt

# Copyright (c) 2012-2014 International Business Machines

# Corporation and others. All Rights Reserved.

#

# This file should be in UTF-8 with a signature byte sequence ("BOM").

#

# collationtest.txt: Collation test data.

#

# created on: 2012apr13

# created by: Markus W. Scherer

# A line with "** test: description" is used for verbose and error output.

# A collator can be set with "@ root" or "@ locale language-tag",

# for example "@ locale de-u-co-phonebk".

# An old-style locale ID can also be used, for example "@ locale de@collation=phonebook".

# A collator can be built with "@ rules".

# An "@ rules" line is followed by zero or more lines with the tailoring rules

# (zero lines yield empty rules, see "** test: empty rules").

# A collator can be modified with "% attribute=value", for example "% strength=primary",

# or with "% reorder" followed by reorder codes, for example "% reorder Hani Zzzz digit" or "% reorder default".

# "* compare" tests the order (= or <) of the following strings.

# The first string of a "* compare" section is compared against the empty string.

# The relation can be "=" or "<" (the level of the difference is not specified)

# or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference;

# "<i" is a difference on the identical level, used with strength=identical).

# Test sections ("* compare") are terminated by

# definitions of new collators, changing attributes, or new test sections.

** test: simple CEs & expansions | |

# Many types of mappings are tested elsewhere, including via the UCA conformance tests. | |

# Here we mostly cover a few unusual mappings. | |

@ rules | |

&\x01 # most control codes are ignorable | |

<<<\u0300 # tertiary CE | |

&9<\x00 # NUL not ignorable | |

&\uA00A\uA00B=\uA002 # two long-primary CEs | |

&\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits | |

* compare | |

= \x01 | |

= \x02 | |

<3 \u0300 | |

<1 9 | |

<1 \x00 | |

= \x01\x00\x02 | |

<1 a | |

<3 a\u0300 | |

<2 a\u0308 | |

= ä | |

<1 b | |

<1 か # Hiragana Ka (U+304B) | |

<2 か\u3099 # plus voiced sound mark | |

= が # Hiragana Ga (U+304C) | |

<1 \uA00A\uA00B | |

= \uA002 | |

<1 \uA00A\uA00B\u00050004 | |

<1 \uA00A\uA00B\u00050005 | |

= \uA003 | |

<1 \uA00A\uA00B\u00050006 | |

** test: contractions | |

# Create some interesting mappings, and map some normalization-inert characters | |

# (which are not subject to canonical reordering) | |

# to some of the same CEs to check the sequence of CEs. | |

@ rules | |

# Contractions starting with 'a' should not continue with any character < U+0300 | |

# so that we can test a shortcut for that. | |

&a=ⓐ | |

&b<bz=ⓑ | |

&d<dz\u0301=ⓓ # d+z+acute | |

&z | |

<a\u0301=Ⓐ # a+acute sorts after z | |

<a\u0301\u0301=Ⓑ # a+acute+acute | |

<a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right | |

<a\u030a=Ⓓ # a+ring | |

<a\u0323=Ⓔ # a+dot below | |

<a\u0323\u0358=Ⓕ # a+dot below+dot above right | |

<a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring | |

<a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z | |

&\U0001D158=⁰ # musical notehead black (has a symbol primary) | |

<\U0001D158\U0001D165=¼ # musical quarter note | |

# deliberately missing prefix contractions: | |

# dz | |

# a\u0327 | |

# a\u0327\u0323 | |

# a\u0327\u0323b | |

&\x01 | |

<<<\U0001D165=¹ # musical stem (ccc=216) | |

<<<\U0001D16D=² # musical augmentation dot (ccc=226) | |

<<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226) | |

&\u0301=❶ # acute (ccc=230) | |

&\u030a=❷ # ring (ccc=230) | |

&\u0308=❸ # diaeresis (ccc=230) | |

<<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230) | |

&\u0327=❺ # cedilla (ccc=202) | |

&\u0323=❻ # dot below (ccc=220) | |

&\u0331=❼ # macron below (ccc=220) | |

<<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232) | |

&\u0334=❾ # tilde overlay (ccc=1) | |

&\u0358=❿ # dot above right (ccc=232) | |

&\u0f71=① # tibetan vowel sign aa | |

&\u0f72=② # tibetan vowel sign i | |

# \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73 | |

&\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129) | |

** test: simple contractions | |

# Some strings are chosen to cause incremental contiguous contraction matching to | |

# go into partial matches for prefixes of contractions | |

# (where the prefixes are deliberately not also contractions). | |

# When there is no complete match, then the matching code must back out of those | |

# so that discontiguous contractions work as specified. | |

* compare | |

# contraction starter with no following text, or mismatch, or blocked | |

<1 a | |

= ⓐ | |

<1 aa | |

= ⓐⓐ | |

<1 ab | |

= ⓐb | |

<1 az | |

= ⓐz | |

* compare | |

<1 a | |

<2 a\u0308\u030a # ring blocked by diaeresis | |

= ⓐ❸❷ | |

<2 a\u0327 | |

= ⓐ❺ | |

* compare | |

<2 \u0308 | |

= ❸ | |

<2 \u0308\u030a\u0301 # acute blocked by ring | |

= ❸❷❶ | |

* compare | |

<1 \U0001D158 | |

= ⁰ | |

<1 \U0001D158\U0001D165 | |

= ¼ | |

# no discontiguous contraction because of missing prefix contraction d+z, | |

# and a starter ('z') after the 'd' | |

* compare | |

<1 dz\u0323\u0301 | |

= dz❻❶ | |

# contiguous contractions | |

* compare | |

<1 abz | |

= ⓐⓑ | |

<1 abzz | |

= ⓐⓑz | |

* compare | |

<1 a | |

<1 z | |

<1 a\u0301 | |

= Ⓐ | |

<1 a\u0301\u0301 | |

= Ⓑ | |

<1 a\u0301\u0301\u0358 | |

= Ⓒ | |

<1 a\u030a | |

= Ⓓ | |

<1 a\u0323\u0358 | |

= Ⓕ | |

<1 a\u0327\u0323\u030a # match despite missing prefix | |

= Ⓖ | |

<1 a\u0327\u0323bz | |

= Ⓗ | |

* compare | |

<2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second | |

= ❸❹ | |

* compare | |

<1 \U0001D158\U0001D165 | |

= ¼ | |

* compare | |

<3 \U0001D165\U0001D16D | |

= ³ | |

** test: discontiguous contractions | |

* compare | |

<1 a\u0327\u030a # a+ring skips cedilla | |

= Ⓓ❺ | |

<2 a\u0327\u0327\u030a # a+ring skips 2 cedillas | |

= Ⓓ❺❺ | |

<2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas | |

= Ⓓ❺❺❺ | |

<2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas | |

= Ⓓ❾❺❺ | |

<1 a\u0327\u0323 # a+dot below skips cedilla | |

= Ⓔ❺ | |

<1 a\u0323\u0301\u0358 # a+dot below+dot above right: 2-char match (a+dot below), then skips acute

= Ⓕ❶ | |

<2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay | |

= Ⓕ❾ | |

* compare | |

<2 \u0331\u0331\u0358 # macron below+dot above right skips the second macron below

= ❽❼ | |

* compare | |

<1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron) | |

= Ⓓ❺❼❻ | |

<1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla | |

= Ⓔ❺²❷ | |

<2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas | |

= Ⓔ❺❺❷ | |

<2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla | |

= Ⓔ❺❻❷ | |

<2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla | |

= Ⓔ❾❺❷ | |

* compare | |

<1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla | |

= ¼❺ | |

<1 a\U0001D165\u0323 # a+dot below skips stem | |

= Ⓔ¹ | |

# partial contiguous match, backs up, matches discontiguous contraction | |

<1 a\u0327\u0323b | |

= Ⓔ❺b | |

<1 a\u0327\u0323ba | |

= Ⓔ❺bⓐ | |

# a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks | |

* compare | |

<1 a\u0327\u0301\u0301\u0358 | |

= Ⓒ❺ | |

# FCD but not NFD | |

* compare | |

<1 a\u0f73\u0301 # a+acute skips tibetan ii | |

= Ⓐ③ | |

# FCD but the 0f71 inside the 0f73 must be skipped | |

# to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73 | |

* compare | |

<1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72 | |

= ③① | |

** test: discontiguous contractions with nested contractions | |

* compare | |

<1 a\u0323\u0308\u0301\u0358 | |

= Ⓕ❹ | |

<2 a\u0323\u0308\u0301\u0308\u0301\u0358 | |

= Ⓕ❹❹ | |

** test: discontiguous contractions with interleaved contractions | |

* compare | |

# a+ring & cedilla & macron below+dot above right | |

<1 a\u0327\u0331\u030a\u0358 | |

= Ⓓ❺❽ | |

# a+ring & 1x..3x macron below+dot above right | |

<2 a\u0331\u030a\u0358 | |

= Ⓓ❽ | |

<2 a\u0331\u0331\u030a\u0358\u0358 | |

= Ⓓ❽❽ | |

# also skips acute | |

<2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358 | |

= Ⓓ❽❽❽❶ | |

# a+dot below & stem+augmentation dot, followed by contiguous d+z+acute | |

<1 a\U0001D165\u0323\U0001D16Ddz\u0301 | |

= Ⓔ³ⓓ | |

** test: some simple string comparisons | |

@ root | |

* compare | |

# first string compares against "" | |

= \u0000 | |

< a | |

<1 b | |

<3 B | |

= \u0000B\u0000 | |

** test: compare with strength=primary | |

% strength=primary | |

* compare | |

<1 a | |

<1 b | |

= B | |

** test: compare with strength=secondary | |

% strength=secondary | |

* compare | |

<1 a | |

<1 b | |

= B | |

** test: compare with strength=tertiary | |

% strength=tertiary | |

* compare | |

<1 a | |

<1 b | |

<3 B | |

** test: compare with strength=quaternary | |

% strength=quaternary | |

* compare | |

<1 a | |

<1 b | |

<3 B | |

** test: compare with strength=identical | |

% strength=identical | |

* compare | |

<1 a | |

<1 b | |

<3 B | |

** test: côté with forwards secondary | |

@ root | |

* compare | |

<1 cote | |

<2 coté | |

<2 côte | |

<2 côté | |

** test: côté with forwards secondary vs. U+FFFE merge separator | |

# Merged sort keys: On each level, any difference in the first segment | |

# must trump any further difference. | |

* compare | |

<1 cote\uFFFEcôté | |

<2 coté\uFFFEcôte | |

<2 côte\uFFFEcoté | |

<2 côté\uFFFEcote | |

** test: côté with backwards secondary | |

% backwards=on | |

* compare | |

<1 cote | |

<2 côte | |

<2 coté | |

<2 côté | |

** test: côté with backwards secondary vs. U+FFFE merge separator | |

# Merged sort keys: On each level, any difference in the first segment | |

# must trump any further difference. | |

* compare | |

<1 cote\uFFFEcôté | |

<2 côte\uFFFEcoté | |

<2 coté\uFFFEcôte | |

<2 côté\uFFFEcote | |

** test: U+FFFE on identical level | |

@ root | |

% strength=identical | |

* compare | |

# All of these control codes are completely-ignorable, so that | |

# their low code points are compared with the merge separator. | |

# The merge separator must compare less than any other character. | |

<1 \uFFFE\u0001\u0002\u0003 | |

<i \u0001\uFFFE\u0002\u0003 | |

<i \u0001\u0002\uFFFE\u0003 | |

<i \u0001\u0002\u0003\uFFFE | |

* compare | |

# The merge separator must even compare less than U+0000. | |

<1 \uFFFE\u0000\u0000 | |

<i \u0000\uFFFE\u0000 | |

<i \u0000\u0000\uFFFE | |

** test: Hani < surrogates < U+FFFD | |

# Note: compareUTF8() treats unpaired surrogates like U+FFFD,

# so when comparing via compareUTF8() the strings with unpaired surrogates compare equal to each other

# and equal to the string with U+FFFD.

@ root | |

% strength=identical | |

* compare | |

<1 abz | |

<1 a\u4e00z | |

<1 a\U00020000z | |

<1 a\ud800z | |

<1 a\udbffz | |

<1 a\udc00z | |

<1 a\udfffz | |

<1 a\ufffdz | |

** test: script reordering | |

@ root | |

% reorder Hani Zzzz digit | |

* compare | |

<1 ? | |

<1 + | |

<1 丂 | |

<1 a | |

<1 α | |

<1 5 | |

% reorder default | |

* compare | |

<1 ? | |

<1 + | |

<1 5 | |

<1 a | |

<1 α | |

<1 丂 | |

** test: empty rules | |

@ rules | |

* compare | |

<1 a | |

<2 ä | |

<3 Ä | |

<1 b | |

** test: very simple rules | |

@ rules | |

&a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z | |

% strength=quaternary | |

* compare | |

<1 a | |

= e | |

<4 q | |

<4 r | |

<1 x | |

<3 X | |

<2 y | |

<3 Y | |

<2 z | |

<3 Z | |

** test: tailoring twice before a root position: primary | |

@ rules | |

&[before 1]b<p | |

&[before 1]b<q | |

* compare | |

<1 a | |

<1 p | |

<1 q | |

<1 b | |

** test: tailoring twice before a root position: secondary | |

@ rules | |

&[before 2]ſ<<p | |

&[before 2]ſ<<q | |

* compare | |

<1 s | |

<2 p | |

<2 q | |

<2 ſ | |

# secondary-before common weight | |

@ rules | |

&[before 2]b<<p | |

&[before 2]b<<q | |

* compare | |

<1 a | |

<1 p | |

<2 q | |

<2 b | |

** test: tailoring twice before a root position: tertiary | |

@ rules | |

&[before 3]B<<<p | |

&[before 3]B<<<q | |

* compare | |

<1 b | |

<3 p | |

<3 q | |

<3 B | |

# tertiary-before common weight | |

@ rules | |

&[before 3]b<<<p | |

&[before 3]b<<<q | |

* compare | |

<1 a | |

<1 p | |

<3 q | |

<3 b | |

@ rules | |

&[before 2]b<<s | |

&[before 3]s<<<p | |

&[before 3]s<<<q | |

* compare | |

<1 a | |

<1 p | |

<3 q | |

<3 s | |

<2 b | |

** test: tailor after completely ignorable | |

@ rules | |

&\x00<<<x<<y | |

* compare | |

= \x00 | |

= \x1F | |

<3 x | |

<2 y | |

** test: secondary tailoring gaps, ICU ticket 9362 | |

@ rules | |

&[before 2]s<<'_' | |

&s<<r # secondary between s and ſ (long s) | |

&ſ<<*a-q # more than 15 between ſ and secondary CE boundary | |

&[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE | |

&[last primary ignorable]<<y<<z | |

* compare | |

<2 u | |

<2 v | |

<2 \u0332 # lowest secondary CE | |

<2 \u0308 | |

<2 y | |

<2 z | |

<1 s_ | |

<2 ss | |

<2 sr | |

<2 sſ | |

<2 sa | |

<2 sb | |

<2 sp | |

<2 sq | |

<2 sus | |

<2 svs | |

<2 rs | |

** test: tertiary tailoring gaps, ICU ticket 9362 | |

@ rules | |

&[before 3]t<<<'_' | |

&t<<<r # tertiary between t and fullwidth t | |

&ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary | |

&[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE | |

&[last secondary ignorable]<<<y<<<z | |

* compare | |

<3 u | |

<3 v | |

# Note: The root collator currently does not map any characters to tertiary CEs. | |

<3 y | |

<3 z | |

<1 t_ | |

<3 tt | |

<3 tr | |

<3 tｔ | |

<3 tᵀ | |

<3 ta | |

<3 tb | |

<3 tp | |

<3 tq | |

<3 tut | |

<3 tvt | |

<3 rt | |

** test: secondary & tertiary around root character | |

@ rules | |

&[before 2]m<<r | |

&m<<s | |

&[before 3]m<<<u | |

&m<<<v | |

* compare | |

<1 l | |

<1 r | |

<2 u | |

<3 m | |

<3 v | |

<2 s | |

<1 n | |

** test: secondary & tertiary around tailored item | |

@ rules | |

&m<x | |

&[before 2]x<<r | |

&x<<s | |

&[before 3]x<<<u | |

&x<<<v | |

* compare | |

<1 m | |

<1 r | |

<2 u | |

<3 x | |

<3 v | |

<2 s | |

<1 n | |

** test: more nesting of secondary & tertiary before | |

@ rules | |

&[before 3]m<<<u | |

&[before 2]m<<r | |

&[before 3]r<<<q | |

&m<<<w | |

&m<<t | |

&[before 3]w<<<v | |

&w<<<x | |

&w<<s | |

* compare | |

<1 l | |

<1 q | |

<3 r | |

<2 u | |

<3 m | |

<3 v | |

<3 w | |

<3 x | |

<2 s | |

<2 t | |

<1 n | |

** test: case bits | |

@ rules | |

&w<x # tailored CE getting case bits | |

=uv=uV=Uv=UV # 2 chars -> 1 CE | |

&ae=ch=cH=Ch=CH # 2 chars -> 2 CEs | |

&rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs | |

% caseFirst=lower | |

* compare | |

<1 ae | |

= ch | |

<3 cH | |

<3 Ch | |

<3 CH | |

<1 rst | |

= yz | |

<3 yZ | |

<3 Yz | |

<3 YZ | |

<1 w | |

<1 x | |

= uv | |

<3 uV | |

= Uv # mixed case on single CE cannot distinguish variations | |

<3 UV | |

** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower | |

@ rules | |

&\u0001<<<t<<<T # tertiary CEs | |

% caseFirst=lower | |

* compare | |

<1 aa | |

<3 aat | |

<3 aaT | |

<3 aA | |

<3 aAt | |

<3 ata | |

<3 aTa | |

** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper | |

% caseFirst=upper | |

* compare | |

<1 aA | |

<3 aAt | |

<3 aa | |

<3 aat | |

<3 aaT | |

<3 ata | |

<3 aTa | |

** test: reset on expansion, ICU tickets 9415 & 9593 | |

@ rules | |

&æ<x # tailor the last primary CE so that x sorts between ae and af | |

&æb=bæ # copy all reset CEs to make bæ sort the same | |

&각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂 | |

&⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference | |

&l·=z # handle the pre-context for · when fetching reset CEs | |

<<u # copy/tailor 2 CEs | |

* compare | |

<1 ae | |

<2 æ | |

<1 x | |

<1 af | |

* compare | |

<1 aeb | |

<2 æb | |

= bæ | |

* compare | |

<1 각 | |

<1 h | |

<1 갂 | |

<1 갃 | |

* compare | |

<1 · # by itself: primary CE | |

<1 l | |

<2 l· # l+middle dot has only a secondary difference from l | |

= z | |

<2 u | |

* compare | |

<1 (13) | |

<3 ⒀ # DUCET sets special tertiary weights in all CEs | |

<2 y | |

<1 (13[ | |

% alternate=shifted | |

* compare | |

<1 (13) | |

= 13 | |

<3 ⒀ | |

= y # alternate=shifted removes the tailoring difference on the last CE | |

<1 14 | |

** test: contraction inside extension, ICU ticket 9378 | |

@ rules | |

&а<<х/й # all letters are Cyrillic | |

* compare | |

<1 ай | |

<2 х | |

** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104 | |

@ rules | |

&t<x &ᵀ<y # same primary weights | |

&q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent | |

* compare | |

<1 q | |

<1 u | |

<1 v | |

<1 ꝗ | |

<1 t | |

<3 ᵀ | |

<1 y | |

<1 x | |

# Principle: Each rule builds on the state of preceding rules and ignores following rules. | |

** test: later rule does not affect earlier reset position, ICU ticket 10105 | |

@ rules | |

&a < u < v < w &ov < x &b < v | |

* compare | |

<1 oa | |

<1 ou | |

<1 x # CE(o) followed by CE between u and w | |

<1 ow | |

<1 ob | |

<1 ov | |

** test: later rule does not affect earlier extension (1), ICU ticket 10105 | |

@ rules | |

&a=x/b &v=b | |

% strength=secondary | |

* compare | |

<1 B | |

<1 c | |

<1 v | |

= b | |

* compare | |

<1 AB | |

= x | |

<1 ac | |

<1 av | |

= ab | |

** test: later rule does not affect earlier extension (2), ICU ticket 10105 | |

@ rules | |

&a <<< c / e &g <<< e / l | |

% strength=secondary | |

* compare | |

<1 AE | |

= c | |

<2 æ | |

<1 agl | |

= ae | |

** test: later rule does not affect earlier extension (3), ICU ticket 10105 | |

@ rules | |

&a = b / c &d = c / e | |

% strength=secondary | |

* compare | |

<1 AC # C is still only tertiary different from the original c | |

= b | |

<1 ade | |

= ac | |

** test: extension contains tailored character, ICU ticket 10105 | |

@ rules | |

&a=e &b=u/e | |

* compare | |

<1 a | |

= e | |

<1 ba | |

= be | |

= u | |

** test: add simple mappings for characters with root context | |

@ rules | |

&z=· # middle dot has a prefix mapping in the CLDR root | |

&n=и # и (U+0438) has contractions in the root | |

* compare | |

<1 l | |

<2 l· # root mapping for l|· still works | |

<1 z | |

= · | |

* compare | |

<1 n | |

= и | |

<1 И | |

<1 и\u0306 # root mapping for й=и\u0306 still works | |

= й | |

<3 Й | |

** test: add context mappings around characters with root context | |

@ rules | |

&z=·h # middle dot has a prefix mapping in the CLDR root | |

&n=ә|и # и (U+0438) has contractions in the root | |

* compare | |

<1 l | |

<2 l· # root mapping for l|· still works | |

<1 z | |

= ·h | |

* compare | |

<1 и | |

<3 И | |

<1 и\u0306 # root mapping for й=и\u0306 still works | |

= й | |

* compare | |

<1 әn | |

= әи | |

<1 әo | |

** test: many secondary CEs at the top of their range | |

@ rules | |

&[last primary ignorable]<<*\u2801-\u28ff | |

* compare | |

<2 \u0308 | |

<2 \u2801 | |

<2 \u2802 | |

<2 \u2803 | |

<2 \u2804 | |

<2 \u28fd | |

<2 \u28fe | |

<2 \u28ff | |

<1 \x20 | |

** test: many tertiary CEs at the top of their range | |

@ rules | |

&[last secondary ignorable]<<<*a-z | |

* compare | |

<3 a | |

<3 b | |

<3 c | |

<3 d | |

# e..w | |

<3 x | |

<3 y | |

<3 z | |

<2 \u0308 | |

** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101 | |

@ rules | |

&a=p|x &b=px &c=op | |

* compare | |

<1 b | |

= px | |

<3 B | |

<1 c | |

= op | |

<3 C | |

* compare | |

<1 ca | |

= opx # first contraction op, then prefix p|x | |

<3 cA | |

<3 Ca | |

** test: reset position with prefix (pre-context), ICU ticket 10102 | |

@ rules | |

&a=p|x &px=y | |

* compare | |

<1 pa | |

= px | |

= y | |

<3 pA | |

<1 q | |

<1 x | |

** test: prefix+contraction together (1), ICU ticket 10071 | |

@ rules | |

&x=a|bc | |

* compare | |

<1 ab | |

<1 Abc | |

<1 abd | |

<1 ac | |

<1 aw | |

<1 ax | |

= abc | |

<3 aX | |

<3 Ax | |

<1 b | |

<1 bb | |

<1 bc | |

<3 bC | |

<3 Bc | |

<1 bd | |

** test: prefix+contraction together (2), ICU ticket 10071 | |

@ rules | |

&w=bc &x=a|b | |

* compare | |

<1 w | |

= bc | |

<3 W | |

* compare | |

<1 aw | |

<1 ax | |

= ab | |

<3 aX | |

<1 axb | |

<1 axc | |

= abc # prefix match a|b takes precedence over contraction match bc | |

<3 abC | |

<1 abd | |

<1 ay | |

** test: prefix+contraction together (3), ICU ticket 10071 | |

@ rules | |

&x=a|b &w=bc # reverse order of rules as previous test, order should not matter here | |

* compare # same "compare" sequences as previous test | |

<1 w | |

= bc | |

<3 W | |

* compare | |

<1 aw | |

<1 ax | |

= ab | |

<3 aX | |

<1 axb | |

<1 axc | |

= abc # prefix match a|b takes precedence over contraction match bc | |

<3 abC | |

<1 abd | |

<1 ay | |

** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962 | |

@ rules | |

&d=ch &v=p|ci | |

* compare | |

<1 pc | |

<3 pC | |

<1 pcH | |

<1 pcI | |

<1 pd | |

= pch # no-prefix contraction ch matches | |

<3 pD | |

<1 pv | |

= pci # prefix+contraction p|ci matches | |

<3 pV | |

** test: tailor in & around compact ranges of root primaries | |

# The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs | |

# which should be reliably encoded as one range in the root elements data. | |

@ rules | |

&[before 1]ᚁ<a | |

&ᚁ<b | |

&[before 1]ᚂ<c | |

&ᚂ<d | |

&[before 1]ᚚ<y | |

&ᚚ<z | |

&[before 2]ᚁ<<r | |

&ᚁ<<s | |

&[before 3]ᚚ<<<t | |

&ᚚ<<<u | |

* compare | |

<1 ᣵ # U+18F5 last Canadian Aboriginal | |

<1 a | |

<1 r | |

<2 ᚁ | |

<2 s | |

<1 b | |

<1 c | |

<1 ᚂ | |

<1 d | |

<1 ᚃ | |

<1 ᚙ | |

<1 y | |

<1 t | |

<3 ᚚ | |

<3 u | |

<1 z | |

<1 ᚠ # U+16A0 first Runic | |

** test: suppressContractions | |

@ rules | |

&z<ch<әж [suppressContractions [·cә]] | |

* compare | |

<1 ch | |

<3 cH # ch was suppressed | |

<1 l | |

<1 l· # primary difference, not secondary, because l|· was suppressed | |

<1 ә | |

<2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed | |

<1 әж | |

<3 әЖ | |

** test: Hangul & Jamo | |

@ rules | |

&L=\u1100 # first Jamo L | |

&V=\u1161 # first Jamo V | |

&T=\u11A8 # first Jamo T | |

&\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs | |

* compare | |

<1 Lv | |

<3 LV | |

= \u1100\u1161 | |

= \uAC00 | |

<1 LVt | |

<3 LVT | |

= \u1100\u1161\u11A8 | |

= \uAC00\u11A8 | |

= \uAC01 | |

<2 LVT\u0308 | |

<2 \u4E00 | |

<2 \u4E01 | |

<2 \u4E80 | |

<2 \u4EFF | |

<2 LV\u0308T | |

<1 \uAC02 | |

** test: adjust special reset positions according to previous rules, CLDR ticket 6070 | |

@ rules | |

&[last variable]<x | |

[maxVariable space] # has effect only after building, no effect on following rules | |

&[last variable]<y | |

&[before 1][first regular]<z | |

* compare | |

<1 ? # some punctuation | |

<1 x | |

<1 y | |

<1 z | |

<1 $ # some symbol | |

@ rules | |

&[last primary ignorable]<<x<<<y | |

&[last primary ignorable]<<z | |

* compare | |

<2 \u0358 | |

<2 x | |

<3 y | |

<2 z | |

<1 \x20 | |

@ rules | |

&[last secondary ignorable]<<<x | |

&[last secondary ignorable]<<<y | |

* compare | |

<3 x | |

<3 y | |

<2 \u0358 | |

@ rules | |

&[before 2][first variable]<<z | |

&[before 2][first variable]<<y | |

&[before 3][first variable]<<<x | |

&[before 3][first variable]<<<w | |

&[before 1][first variable]<v | |

&[before 2][first variable]<<u | |

&[before 3][first variable]<<<t | |

&[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary | |

* compare | |

<2 \u0358 | |

<1 s | |

<2 \uFDD1\xA0 | |

<1 t | |

<3 u | |

<2 v | |

<1 w | |

<3 x | |

<3 y | |

<2 z | |

<2 \t | |

@ rules | |

&[before 2][first regular]<<z | |

&[before 3][first regular]<<<y | |

&[before 1][first regular]<x | |

&[before 3][first regular]<<<w | |

&[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary | |

&[before 3][first regular]<<<u | |

&[before 1][first regular]<p # primary before the boundary: becomes variable | |

&[before 3][first regular]<<<t # not affected by p | |

&[last variable]<q # after p! | |

* compare | |

<1 ? | |

<1 p | |

<1 q | |

<1 t | |

<3 u | |

<3 v | |

<1 w | |

<3 x | |

<1 y | |

<3 z | |

<1 $ | |

# check that p & q are indeed variable | |

% alternate=shifted | |

* compare | |

= ? | |

= p | |

= q | |

<1 t | |

<3 u | |

<3 v | |

<1 w | |

<3 x | |

<1 y | |

<3 z | |

<1 $ | |

@ rules | |

&[before 2][first trailing]<<z | |

&[before 1][first trailing]<y | |

&[before 3][first trailing]<<<x | |

* compare | |

<1 \u4E00 # first Han, first implicit | |

<1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary | |

# Note: The root collator currently does not map any characters to the trailing first boundary primary. | |

<1 x | |

<3 y | |

<1 z | |

<2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary. | |

@ rules | |

&[before 2][first primary ignorable]<<z | |

&[before 2][first primary ignorable]<<y | |

&[before 3][first primary ignorable]<<<x | |

&[before 3][first primary ignorable]<<<w | |

* compare | |

= \x01 | |

<2 w | |

<3 x | |

<3 y | |

<2 z | |

<2 \u0301 | |

@ rules | |

&[before 3][first secondary ignorable]<<<y | |

&[before 3][first secondary ignorable]<<<x | |

* compare | |

= \x01 | |

<3 x | |

<3 y | |

<2 \u0301 | |

** test: canonical closure | |

@ rules | |

&X=A &U=Â | |

* compare | |

<1 U | |

= Â | |

= A\u0302 | |

<2 Ú # U with acute | |

= U\u0301 | |

= Ấ # A with circumflex & acute | |

= Â\u0301 | |

= A\u0302\u0301 | |

<1 X | |

= A | |

<2 X\u030A # with ring above | |

= Å | |

= A\u030A | |

= \u212B # Angstrom sign | |

@ rules | |

&x=\u5140\u55C0 | |

* compare | |

<1 x | |

= \u5140\u55C0 | |

= \u5140\uFA0D | |

= \uFA0C\u55C0 | |

= \uFA0C\uFA0D # CJK compatibility characters | |

<3 X | |

# canonical closure on prefix rules, ICU ticket 9444 | |

@ rules | |

&x=ä|ŝ | |

* compare | |

<1 äs # not tailored | |

<1 äx | |

= äŝ | |

= a\u0308s\u0302 | |

= a\u0308ŝ | |

= äs\u0302 | |

<3 äX | |

** test: conjoining Jamo map to expansions | |

@ rules | |

&gg=\u1101 # Jamo Lead consonant GG | |

&nj=\u11AC # Jamo Trail consonant NJ | |

* compare | |

<1 gg\u1161nj | |

= \u1101\u1161\u11AC | |

= \uAE4C\u11AC | |

= \uAE51 | |

<3 gg\u1161nJ | |

<1 \u1100\u1100 | |

** test: canonical tail closure, ICU ticket 5913 | |

@ rules | |

&a<â | |

* compare | |

<1 a | |

<1 â # tailored | |

= a\u0302 | |

<2 a\u0323\u0302 # discontiguous contraction | |

= ạ\u0302 # equivalent | |

= ậ # equivalent | |

<1 b | |

@ rules | |

&a<ạ | |

* compare | |

<1 a | |

<1 ạ # tailored | |

= a\u0323 | |

<2 a\u0323\u0302 # contiguous contraction plus extra diacritic | |

= ạ\u0302 # equivalent | |

= ậ # equivalent | |

<1 b | |

# Tail closure should work even if there is a prefix and/or contraction. | |

@ rules | |

&a<\u5140|câ | |

# In order to find discontiguous contractions for \u5140|câ | |

# there must exist a mapping for \u5140|ca, regardless of what it maps to. | |

# (This follows from the UCA spec.) | |

&x=\u5140|ca | |

* compare | |

<1 \u5140a | |

= \uFA0Ca | |

<1 \u5140câ # tailored | |

= \uFA0Ccâ | |

= \u5140ca\u0302 | |

= \uFA0Cca\u0302 | |

<2 \u5140ca\u0323\u0302 # discontiguous contraction | |

= \uFA0Cca\u0323\u0302 | |

= \u5140cạ\u0302 | |

= \uFA0Ccạ\u0302 | |

= \u5140cậ | |

= \uFA0Ccậ | |

<1 \u5140b | |

= \uFA0Cb | |

<1 \u5140x | |

= \u5140ca | |

# Double-check that without the extra mapping there will be no discontiguous match. | |

@ rules | |

&a<\u5140|câ | |

* compare | |

<1 \u5140a | |

= \uFA0Ca | |

<1 \u5140câ # tailored | |

= \uFA0Ccâ | |

= \u5140ca\u0302 | |

= \uFA0Cca\u0302 | |

<1 \u5140b | |

= \uFA0Cb | |

<1 \u5140ca\u0323\u0302 # no discontiguous contraction | |

= \uFA0Cca\u0323\u0302 | |

= \u5140cạ\u0302 | |

= \uFA0Ccạ\u0302 | |

= \u5140cậ | |

= \uFA0Ccậ | |

@ rules | |

&a<cạ | |

* compare | |

<1 a | |

<1 cạ # tailored | |

= ca\u0323 | |

<2 ca\u0323\u0302 # contiguous contraction plus extra diacritic | |

= cạ\u0302 # equivalent | |

= cậ # equivalent | |

<1 b | |

# ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI | |

# = 03C9 0313 0300 0345 | |

# ccc = 0, 230, 230, 240 | |

@ rules | |

&δ=αῳ | |

# In order to find discontiguous contractions for αῳ | |

# there must exist a mapping for αω, regardless of what it maps to. | |

# (This follows from the UCA spec.) | |

&ε=αω | |

* compare | |

<1 δ | |

= αῳ | |

= αω\u0345 | |

<2 αω\u0313\u0300\u0345 # discontiguous contraction | |

= αὠ\u0300\u0345 | |

= αὢ\u0345 | |

= αᾢ | |

<2 αω\u0300\u0313\u0345 | |

= αὼ\u0313\u0345 | |

= αῲ\u0313 # not FCD | |

<1 ε | |

= αω | |

# Double-check that without the extra mapping there will be no discontiguous match. | |

@ rules | |

&δ=αῳ | |

* compare | |

<1 αω\u0313\u0300\u0345 # no discontiguous contraction | |

= αὠ\u0300\u0345 | |

= αὢ\u0345 | |

= αᾢ | |

<2 αω\u0300\u0313\u0345 | |

= αὼ\u0313\u0345 | |

= αῲ\u0313 # not FCD | |

<1 δ | |

= αῳ | |

= αω\u0345 | |

# Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232. | |

# Tests code paths where the tailored string has a combining mark | |

# that does not occur in any composite's decomposition. | |

@ rules | |

&δ=αὼ\u0315 | |

* compare | |

<1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above. | |

= αὠ\u0300\u0315 | |

= αὢ\u0315 | |

<1 δ | |

= αὼ\u0315 | |

= αω\u0300\u0315 | |

<2 αω\u0300\u0315\u0345 | |

= αὼ\u0315\u0345 | |

= αῲ\u0315 # not FCD | |

** test: danish a+a vs. a-umlaut, ICU ticket 9319 | |

@ rules | |

&z<aa | |

* compare | |

<1 z | |

<1 aa | |

<2 aa\u0308 | |

= aä | |

** test: Jamo L with and in prefix | |

# Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L). | |

@ rules | |

# Jamo Lead consonant G after G or GG | |

&[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100 | |

# Jamo Lead consonant GG sorts like G+G | |

&\u1100\u1100=\u1101 | |

# Note: Making G|GG and GG|GG sort the same as G|G+G | |

# would require the ability to reset on G|G+G, | |

# or we could make G-after-G equal to some secondary-CE character, | |

# and reset on a pair of those. | |

# (It does not matter much if there are at most two G in a row in real text.) | |

* compare | |

<1 \u1100 | |

<2 \u1100\u1100 # only one primary from a sequence of G lead consonants | |

= \u1101 | |

<2 \u1100\u1100\u1100 | |

= \u1101\u1100 | |

# but not = \u1100\u1101, see above | |

<1 \u1100\u1161 | |

= \uAC00 | |

<2 \u1100\u1100\u1161 | |

= \u1100\uAC00 # prefix match from the L of the LV syllable | |

= \u1101\u1161 | |

= \uAE4C | |

** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546 | |

@ rules | |

# Low secondary CEs for Jamo V & T.

# Note: T should sort before V for proper syllable order.

# (In this test only two Jamo V, U+1161 and U+1162, are tailored; no Jamo T is included.)

&\u0332 # COMBINING LOW LINE (first primary ignorable) | |

<<\u1161<<\u1162 | |

# Korean Jamo lead consonant search rules, part 2: | |

# Make modern compound L jamo primary equivalent to non-compound forms. | |

# Secondary CEs for Jamo L-after-L, greater than Jamo V & T. | |

&\u0313 # COMBINING COMMA ABOVE (second primary ignorable) | |

=\u1100|\u1100 | |

=\u1103|\u1103 | |

=\u1107|\u1107 | |

=\u1109|\u1109 | |

=\u110C|\u110C | |

# Compound L Jamo map to equivalent expansions of primary+secondary CE. | |

&\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK | |

&\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT | |

&\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP | |

&\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS | |

&\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC | |

* compare | |

<1 \u1100\u1161 | |

= \uAC00 | |

<2 \u1100\u1162 | |

= \uAC1C | |

<2 \u1100\u1100\u1161 | |

= \u1100\uAC00 | |

= \u1101\u1161 | |

= \uAE4C | |

<3 \u3132\u1161 | |

** test: Hangul syllables in prefix & in the interior of a contraction | |

@ rules | |

&x=\u1100\u1161|a\u1102\u1162z | |

* compare | |

<1 \u1100\u1161x | |

= \u1100\u1161a\u1102\u1162z | |

= \u1100\u1161a\uB0B4z | |

= \uAC00a\u1102\u1162z | |

= \uAC00a\uB0B4z | |

** test: digits are unsafe-backwards when numeric=on | |

@ root | |

% numeric=on | |

* compare | |

# If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a". | |

# We need to back up before the identical prefix "1" and compare the full numbers. | |

<1 11b | |

<1 101a | |

** test: simple locale data test | |

@ locale de | |

* compare | |

<1 a | |

<2 ä | |

<1 ae | |

<2 æ | |

@ locale de-u-co-phonebk | |

* compare | |

<1 a | |

<1 ae | |

<2 ä | |

<2 æ | |

# The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt. | |

** test: DataDrivenCollationTest/TestMorePinyin | |

# Testing the primary strength. | |

@ locale zh | |

% strength=primary | |

* compare | |

< lā | |

= lĀ | |

= Lā | |

= LĀ | |

< lān | |

= lĀn | |

< lē | |

= lĒ | |

= Lē | |

= LĒ | |

< lēn | |

= lĒn | |

** test: DataDrivenCollationTest/TestLithuanian | |

# Lithuanian sort order. | |

@ locale lt | |

* compare | |

< cz | |

< č | |

< d | |

< iz | |

< j | |

< sz | |

< š | |

< t | |

< zz | |

< ž | |

** test: DataDrivenCollationTest/TestLatvian | |

# Latvian sort order. | |

@ locale lv | |

* compare | |

< cz | |

< č | |

< d | |

< gz | |

< ģ | |

< h | |

< iz | |

< j | |

< kz | |

< ķ | |

< l | |

< lz | |

< ļ | |

< m | |

< nz | |

< ņ | |

< o | |

< rz | |

< ŗ | |

< s | |

< sz | |

< š | |

< t | |

< zz | |

< ž | |

** test: DataDrivenCollationTest/TestEstonian | |

# Estonian sort order. | |

@ locale et | |

* compare | |

< sy | |

< š | |

< šy | |

< z | |

< zy | |

< ž | |

< v | |

< w | |

< va | |

< õ | |

< õy | |

< ä | |

< äy | |

< ö | |

< öy | |

< ü | |

< üy | |

< x | |

** test: DataDrivenCollationTest/TestAlbanian | |

# Albanian sort order. | |

@ locale sq | |

* compare | |

< cz | |

< ç | |

< d | |

< dz | |

< dh | |

< e | |

< ez | |

< ë | |

< f | |

< gz | |

< gj | |

< h | |

< lz | |

< ll | |

< m | |

< nz | |

< nj | |

< o | |

< rz | |

< rr | |

< s | |

< sz | |

< sh | |

< t | |

< tz | |

< th | |

< u | |

< xz | |

< xh | |

< y | |

< zz | |

< zh | |

** test: DataDrivenCollationTest/TestSimplifiedChineseOrder | |

# Sorted file has different order. | |

@ root | |

# normalization=on turned on & off automatically. | |

* compare | |

< \u5F20 | |

< \u5F20\u4E00\u8E3F | |

** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash | |

# Regression test: comparing these Tibetan vowel-sign sequences used to (pretty much) crash. | |

@ root | |

* compare | |

< \u0f71\u0f72\u0f80\u0f71\u0f72 | |

< \u0f80 | |

** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems | |

# These are examples of strings that caused trouble in partial sort key testing. | |

@ locale th-TH | |

* compare | |

< \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C | |

< \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18 | |

* compare | |

< \u0E01\u0E07\u0E01\u0E32\u0E23 | |

< \u0E01\u0E07\u0E42\u0E01\u0E49 | |

* compare | |

< \u0E01\u0E23\u0E19\u0E17\u0E32 | |

< \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32 | |

* compare | |

< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27 | |

< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27 | |

* compare | |

< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D | |

< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32 | |

** test: DataDrivenCollationTest/TestJavaStyleRule | |

# java.text allows rules to start as '<<<x<<<y...' | |

# we emulate this by assuming a &[first tertiary ignorable] in this case. | |

@ rules | |

&\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b | |

* compare | |

= a | |

= equal | |

< z | |

< x | |

= b # x had become the new first primary ignorable | |

< w | |

** test: DataDrivenCollationTest/TestShiftedIgnorable | |

# The UCA states that primary ignorables should be completely | |

# ignorable when following a shifted code point. | |

@ root | |

% alternate=shifted | |

% strength=quaternary | |

* compare | |

< a\u0020b | |

= a\u0020\u0300b | |

= a\u0020\u0301b | |

< a_b | |

= a_\u0300b | |

= a_\u0301b | |

< A\u0020b | |

= A\u0020\u0300b | |

= A\u0020\u0301b | |

< A_b | |

= A_\u0300b | |

= A_\u0301b | |

< a\u0301b | |

< A\u0301b | |

< a\u0300b | |

< A\u0300b | |

** test: DataDrivenCollationTest/TestNShiftedIgnorable | |

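# The UCA states that primary ignorables should be completely | |

# ignorable when following a shifted code point. | |

# Here alternate=non-ignorable, so nothing is shifted and the combining marks | |

# after the space or underscore still make a difference (contrast with TestShiftedIgnorable). | |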

@ root | |

% alternate=non-ignorable | |

% strength=tertiary | |

* compare | |

< a\u0020b | |

< A\u0020b | |

< a\u0020\u0301b | |

< A\u0020\u0301b | |

< a\u0020\u0300b | |

< A\u0020\u0300b | |

< a_b | |

< A_b | |

< a_\u0301b | |

< A_\u0301b | |

< a_\u0300b | |

< A_\u0300b | |

< a\u0301b | |

< A\u0301b | |

< a\u0300b | |

< A\u0300b | |

** test: DataDrivenCollationTest/TestSafeSurrogates | |

# It turned out that surrogates were not skipped properly | |

# when iterating backwards if they were in the middle of a | |

# contraction. This test ensures that this is fixed. | |

@ rules | |

&a < x\ud800\udc00b | |

* compare | |

< a | |

< x\ud800\udc00b | |

** test: DataDrivenCollationTest/da_TestPrimary | |

# This test goes through primary strength cases | |

@ locale da | |

% strength=primary | |

* compare | |

< Lvi | |

< Lwi | |

* compare | |

< L\u00e4vi | |

< L\u00f6wi | |

* compare | |

< L\u00fcbeck | |

= Lybeck | |

** test: DataDrivenCollationTest/da_TestTertiary | |

# This test goes through tertiary strength cases | |

@ locale da | |

% strength=tertiary | |

* compare | |

< Luc | |

< luck | |

* compare | |

< luck | |

< L\u00fcbeck | |

* compare | |

< lybeck | |

< L\u00fcbeck | |

* compare | |

< L\u00e4vi | |

< L\u00f6we | |

* compare | |

< L\u00f6ww | |

< mast | |

* compare | |

< A/S | |

< ANDRE | |

< ANDR\u00c9 | |

< ANDREAS | |

< AS | |

< CA | |

< \u00c7A | |

< CB | |

< \u00c7C | |

< D.S.B. | |

< DA | |

< \u00d0A | |

< DB | |

< \u00d0C | |

< DSB | |

< DSC | |

< EKSTRA_ARBEJDE | |

< EKSTRABUD0 | |

< H\u00d8ST | |

< HAAG | |

< H\u00c5NDBOG | |

< HAANDV\u00c6RKSBANKEN | |

< Karl | |

< karl | |

< NIELS\u0020J\u00d8RGEN | |

< NIELS-J\u00d8RGEN | |

< NIELSEN | |

< R\u00c9E,\u0020A | |

< REE,\u0020B | |

< R\u00c9E,\u0020L | |

< REE,\u0020V | |

< SCHYTT,\u0020B | |

< SCHYTT,\u0020H | |

< SCH\u00dcTT,\u0020H | |

< SCHYTT,\u0020L | |

< SCH\u00dcTT,\u0020M | |

< SS | |

< \u00df | |

< SSA | |

< STORE\u0020VILDMOSE | |

< STOREK\u00c6R0 | |

< STORM\u0020PETERSEN | |

< STORMLY | |

< THORVALD | |

< THORVARDUR | |

< \u00feORVAR\u00d0UR | |

< THYGESEN | |

< VESTERG\u00c5RD,\u0020A | |

< VESTERGAARD,\u0020A | |

< VESTERG\u00c5RD,\u0020B | |

< \u00c6BLE | |

< \u00c4BLE | |

< \u00d8BERG | |

< \u00d6BERG | |

* compare | |

< andere | |

< chaque | |

< chemin | |

< cote | |

< cot\u00e9 | |

< c\u00f4te | |

< c\u00f4t\u00e9 | |

< \u010du\u010d\u0113t | |

< Czech | |

< hi\u0161a | |

< irdisch | |

< lie | |

< lire | |

< llama | |

< l\u00f5ug | |

< l\u00f2za | |

< lu\u010d | |

< luck | |

< L\u00fcbeck | |

< lye | |

< l\u00e4vi | |

< L\u00f6wen | |

< m\u00e0\u0161ta | |

< m\u00eer | |

< myndig | |

< M\u00e4nner | |

< m\u00f6chten | |

< pi\u00f1a | |

< pint | |

< pylon | |

< \u0161\u00e0ran | |

< savoir | |

< \u0160erb\u016bra | |

< Sietla | |

< \u015blub | |

< subtle | |

< symbol | |

< s\u00e4mtlich | |

< verkehrt | |

< vox | |

< v\u00e4ga | |

< waffle | |

< wood | |

< yen | |

< yuan | |

< yucca | |

< \u017eal | |

< \u017eena | |

< \u017den\u0113va | |

< zoo0 | |

< Zviedrija | |

< Z\u00fcrich | |

< zysk0 | |

< \u00e4ndere | |

** test: DataDrivenCollationTest/hi_TestNewRules | |

# This test goes through new rules and tests against old rules | |

@ locale hi | |

* compare | |

< कॐ | |

< कं | |

< कँ | |

< कः | |

** test: DataDrivenCollationTest/ro_TestNewRules | |

# This test goes through new rules and tests against old rules | |

@ locale ro | |

* compare | |

< xAx | |

< xă | |

< xĂ | |

< Xă | |

< XĂ | |

< xăx | |

< xĂx | |

< xâ | |

< xÂ | |

< Xâ | |

< XÂ | |

< xâx | |

< xÂx | |

< xb | |

< xIx | |

< xî | |

< xÎ | |

< Xî | |

< XÎ | |

< xîx | |

< xÎx | |

< xj | |

< xSx | |

< xș | |

= xş | |

< xȘ | |

= xŞ | |

< Xș | |

= Xş | |

< XȘ | |

= XŞ | |

< xșx | |

= xşx | |

< xȘx | |

= xŞx | |

< xT | |

< xTx | |

< xț | |

= xţ | |

< xȚ | |

= xŢ | |

< Xț | |

= Xţ | |

< XȚ | |

= XŢ | |

< xțx | |

= xţx | |

< xȚx | |

= xŢx | |

< xU | |

** test: DataDrivenCollationTest/testOffsets | |

# This tests cases where forwards and backwards iteration get different offsets | |

@ locale en | |

% strength=tertiary | |

* compare | |

< a\uD800\uDC00\uDC00 | |

< b\uD800\uDC00\uDC00 | |

* compare | |

< \u0301A\u0301\u0301 | |

< \u0301B\u0301\u0301 | |

* compare | |

< abcd\r\u0301 | |

< abce\r\u0301 | |

# TODO: test offsets in new CollationTest | |

# End of test cases moved here from ICU 52's DataDrivenCollationTest.txt. | |

** test: was ICU 52 cmsccoll/TestRedundantRules | |

@ rules | |

& a < b < c < d& [before 1] c < m | |

* compare | |

<1 a | |

<1 b | |

<1 m | |

<1 c | |

<1 d | |

@ rules | |

& a < b <<< c << d <<< e& [before 3] e <<< x | |

* compare | |

<1 a | |

<1 b | |

<3 c | |

<2 d | |

<3 x | |

<3 e | |

@ rules | |

& a < b <<< c << d <<< e <<< f < g& [before 1] g < x | |

* compare | |

<1 a | |

<1 b | |

<3 c | |

<2 d | |

<3 e | |

<3 f | |

<1 x | |

<1 g | |

@ rules | |

& a <<< b << c < d& a < m | |

* compare | |

<1 a | |

<3 b | |

<2 c | |

<1 m | |

<1 d | |

@ rules | |

&a<b<<b\u0301 &z<b | |

* compare | |

<1 a | |

<1 b\u0301 | |

<1 z | |

<1 b | |

@ rules | |

&z<m<<<q<<<m | |

* compare | |

<1 z | |

<1 q | |

<3 m | |

@ rules | |

&z<<<m<q<<<m | |

* compare | |

<1 z | |

<1 q | |

<3 m | |

@ rules | |

& a < b < c < d& r < c | |

* compare | |

<1 a | |

<1 b | |

<1 d | |

<1 r | |

<1 c | |

@ rules | |

& a < b < c < d& c < m | |

* compare | |

<1 a | |

<1 b | |

<1 c | |

<1 m | |

<1 d | |

@ rules | |

& a < b < c < d& a < m | |

* compare | |

<1 a | |

<1 m | |

<1 b | |

<1 c | |

<1 d | |

** test: was ICU 52 cmsccoll/TestExpansionSyntax | |

# The following two rule sets should sort the given list of strings the same way. | |

@ rules | |

&AE <<< a << b <<< c &d <<< f | |

* compare | |

<1 AE | |

<3 a | |

<2 b | |

<3 c | |

<1 d | |

<3 f | |

@ rules | |

&A <<< a / E << b / E <<< c /E &d <<< f | |

* compare | |

<1 AE | |

<3 a | |

<2 b | |

<3 c | |

<1 d | |

<3 f | |

# The following two rule sets should sort the given list of strings the same way. | |

@ rules | |

&AE <<< a <<< b << c << d < e < f <<< g | |

* compare | |

<1 AE | |

<3 a | |

<3 b | |

<2 c | |

<2 d | |

<1 e | |

<1 f | |

<3 g | |

@ rules | |

&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g | |

* compare | |

<1 AE | |

<3 a | |

<3 b | |

<2 c | |

<2 d | |

<1 e | |

<1 f | |

<3 g | |

# The following two rule sets should sort the given list of strings the same way. | |

@ rules | |

&AE <<< B <<< C / D <<< F | |

* compare | |

<1 AE | |

<3 B | |

<3 F | |

<1 AED | |

<3 C | |

@ rules | |

&A <<< B / E <<< C / ED <<< F / E | |

* compare | |

<1 AE | |

<3 B | |

<3 F | |

<1 AED | |

<3 C | |

** test: never reorder trailing primaries | |

@ root | |

% reorder Zzzz Grek | |

* compare | |

<1 L | |

<1 字 | |

<1 Ω | |

<1 \uFFFD | |

<1 \uFFFF | |

** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes | |

@ rules | |

&u=ab|cd | |

&v=b|ce | |

* compare | |

<1 abc | |

<1 abcc | |

<1 abcf | |

<1 abcd | |

= abu | |

<1 abce | |

= abv | |

# With the following rules, there is only one prefix per composite ĉ or ç, | |

# but both prefixes apply to just c in NFD form. | |

# We would get different results for composed vs. NFD input | |

# if we fell back directly from longest-prefix mappings to no-prefix mappings. | |

@ rules | |

&x=op|ĉ | |

&y=p|ç | |

* compare | |

<1 opc | |

<2 opć | |

<1 opcz | |

<1 opd | |

<1 opĉ | |

= opc\u0302 | |

= opx | |

<1 opç | |

= opc\u0327 | |

= opy | |

# The mapping that is used is the one with the longest matching prefix for which | |

# a suffix also matches; among several suffix matches for that prefix, the longest one wins. | |

@ rules | |

&❶=d | |

&❷=de | |

&❸=def | |

&①=c|d | |

&②=c|de | |

&③=c|def | |

&④=bc|d | |

&⑤=bc|de | |

&⑥=bc|def | |

&⑦=abc|d | |

&⑧=abc|de | |

&⑨=abc|def | |

* compare | |

<1 9aadzz | |

= 9aa❶zz | |

<1 9aadez | |

= 9aa❷z | |

<1 9aadef | |

= 9aa❸ | |

<1 9acdzz | |

= 9ac①zz | |

<1 9acdez | |

= 9ac②z | |

<1 9acdef | |

= 9ac③ | |

<1 9bcdzz | |

= 9bc④zz | |

<1 9bcdez | |

= 9bc⑤z | |

<1 9bcdef | |

= 9bc⑥ | |

<1 abcdzz | |

= abc⑦zz | |

<1 abcdez | |

= abc⑧z | |

<1 abcdef | |

= abc⑨ | |

** test: prefix + discontiguous contraction with missing prefix contraction | |

# Unfortunate terminology: The first "prefix" here is the pre-context, | |

# the second "prefix" refers to the contraction/relation string that is | |

# one shorter than the one being tested. | |

@ rules | |

&x=p|e | |

&y=p|ê | |

&z=op|ê | |

# No mapping for op|e: | |

# Discontiguous contraction matching should not match op|ê in opệ | |

# because it would have to skip the dot below and extend a match on op|e by the circumflex, | |

# but there is no match on op|e. | |

* compare | |

<1 oPe | |

<1 ope | |

= opx | |

<1 opệ | |

= opy\u0323 # y not z | |

<1 opê | |

= opz | |

# We cannot test for fallback by whether the contraction default CE32 | |

# is for another contraction. With the following rules, there is no mapping for op|e, | |

# and the fallback to prefix p has no contractions. | |

@ rules | |

&x=p|e | |

&z=op|ê | |

* compare | |

<1 oPe | |

<1 ope | |

= opx | |

<2 opệ | |

= opx\u0323\u0302 # x not z | |

<1 opê | |

= opz | |

# One more variation: Fallback to the simple code point, no shorter non-empty prefix. | |

@ rules | |

&x=e | |

&z=op|ê | |

* compare | |

<1 ope | |

= opx | |

<3 oPe | |

= oPx | |

<2 opệ | |

= opx\u0323\u0302 # x not z | |

<1 opê | |

= opz | |

** test: maxVariable via rules | |

@ rules | |

[maxVariable space][alternate shifted] | |

* compare | |

= \u0020 | |

= \u000A | |

<1 . | |

<1 ° # degree sign | |

<1 $ | |

<1 0 | |

** test: maxVariable via setting | |

@ root | |

% maxVariable=currency | |

% alternate=shifted | |

* compare | |

= \u0020 | |

= \u000A | |

= . | |

= ° # degree sign | |

= $ | |

<1 0 | |

** test: ICU4J CollationMiscTest/TestContractionClosure (ää) | |

# This tests canonical closure, but it also tests that CollationFastLatin | |

# bails out properly for contractions with combining marks. | |

# For that we need pairs of strings that remain in the Latin fastpath | |

# long enough, hence the extra "= b" lines. | |

@ rules | |

&b=\u00e4\u00e4 | |

* compare | |

<1 b | |

= \u00e4\u00e4 | |

= b | |

= a\u0308a\u0308 | |

= b | |

= \u00e4a\u0308 | |

= b | |

= a\u0308\u00e4 | |

** test: ICU4J CollationMiscTest/TestContractionClosure (Å) | |

@ rules | |

&b=\u00C5 | |

* compare | |

<1 b | |

= \u00C5 | |

= b | |

= A\u030A | |

= b | |

= \u212B | |

** test: reset-before on already-tailored characters, ICU ticket 10108 | |

@ rules | |

&a<w<<x &[before 2]x<<y | |

* compare | |

<1 a | |

<1 w | |

<2 y | |

<2 x | |

@ rules | |

&a<<w<<<x &[before 2]x<<y | |

* compare | |

<1 a | |

<2 y | |

<2 w | |

<3 x | |

@ rules | |

&a<w<x &[before 2]x<<y | |

* compare | |

<1 a | |

<1 w | |

<1 y | |

<2 x | |

@ rules | |

&a<w<<<x &[before 2]x<<y | |

* compare | |

<1 a | |

<1 y | |

<2 w | |

<3 x | |

** test: numeric collation with other settings, ICU ticket 9092 | |

@ root | |

% strength=identical | |

% caseFirst=upper | |

% numeric=on | |

* compare | |

<1 100\u0020a | |

<1 101 | |

** test: collation type fallback from unsupported type, ICU ticket 10149 | |

@ locale fr-CA-u-co-phonebk | |

# Expect the same result as with fr-CA, using backwards-secondary order. | |

# That is, we should fall back from the unsupported collation type | |

# to the locale's default collation type. | |

* compare | |

<1 cote | |

<2 côte | |

<2 coté | |

<2 côté | |

** test: @ is equivalent to [backwards 2], ICU ticket 9956 | |

@ rules | |

&b<a @ &v<<w | |

* compare | |

<1 b | |

<1 a | |

<1 cote | |

<2 côte | |

<2 coté | |

<2 côté | |

<1 v | |

<2 w | |

<1 x | |

** test: shifted+reordering, ICU ticket 9507 | |

@ root | |

% reorder Grek punct space | |

% alternate=shifted | |

% strength=quaternary | |

# Which primaries are "variable" should be determined without script reordering, | |

# and then primaries should be reordered whether they are shifted to quaternary or not. | |

* compare | |

<4 ( # punctuation | |

<4 ) | |

<4 \u0020 # space | |

<1 ` # symbol | |

<1 ^ | |

<1 $ # currency symbol | |

<1 € | |

<1 0 # numbers | |

<1 ε # Greek | |

<1 e # Latin | |

<1 e(e | |

<4 e)e | |

<4 e\u0020e | |

<4 ee | |

<3 e(E | |

<4 e)E | |

<4 e\u0020E | |

<4 eE | |

** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351 | |

@ rules | |

&\u0001<<<b<<<B | |

% caseFirst=upper | |

* compare | |

<1 aaa | |

<3 aaaB | |

** test: secondary+case ignores secondary ignorables, ICU ticket 9355 | |

@ rules | |

&\u0001<<<b<<<B | |

% strength=secondary | |

% caseLevel=on | |

* compare | |

<1 a | |

= ab | |

= aB | |

** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328 | |

@ rules | |

&[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57 | |

* compare | |

<1 ൗx | |

<2 ൌx | |

<1 ൗy | |

<2 ൌy | |

** test: quoted apostrophe in compact syntax, ICU ticket 8204 | |

@ rules | |

&q<<*a''c | |

* compare | |

<1 d | |

<1 p | |

<1 q | |

<2 a | |

<2 \u0027 | |

<2 c | |

<1 r | |

# ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" | |

** test: locale -u- with collation keywords, ICU ticket 8260 | |

@ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 | |

* compare | |

<4 \u0020 # space is shifted, strength=quaternary | |

<1 ! # punctuation is regular | |

<1 2 | |

<1 12 # numeric sorting | |

<1 B | |

<c b # uppercase first on case level | |

<1 x\u0301\u0308 | |

<2 x\u0308\u0301 # normalization off | |

** test: locale @ with collation keywords, ICU ticket 8260 | |

@ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted | |

* compare | |

<4 $ # currency symbols are shifted, strength=quaternary | |

<1 àla | |

<2 alà # backwards secondary level | |

** test: locale -u- with script reordering, ICU ticket 8260 | |

@ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai | |

* compare | |

<1 \u0020 | |

<1 あ | |

<1 ☂ | |

<1 Ω | |

<1 丂 | |

<1 ж | |

<1 L | |

<1 4 | |

<1 Ձ | |

<1 अ | |

<1 ሄ | |

<1 ฉ | |

** test: locale @collation=type should be case-insensitive | |

@ locale de@coLLation=PhoneBook | |

* compare | |

<1 ae | |

<2 ä | |

<3 Ä |