#!/bin/sh
# Top-level setup shared by the functions defined in this script.

# Abort as soon as any command exits with a non-zero status.
set -e

# Directory containing this script; used to locate the regex/*.sh helpers.
D="$(dirname "$0")"
# Abort the script unless the named program can be found.
#
# $1 - name of the command that must be available
#
# When the `command -v` lookup fails, a message naming the command is written
# to stderr and the shell exits with status 1.
requires() {
  tool="$1"
  command -v "$tool" > /dev/null 2>&1 && return 0
  echo "DEPENDENCY MISSING: $tool must be installed" >&2
  exit 1
}
# Report whether a value occurs among a list of words.
#
# $1    - the value to look for
# $2... - the words to search, compared with exact string equality
#
# Returns 0 as soon as a match is found and 1 when no word matches.
array_exists() {
  wanted="$1"
  shift
  # Consume the remaining arguments one at a time.
  while [ "$#" -gt 0 ]; do
    if [ "$1" = "$wanted" ]; then
      return 0
    fi
    shift
  done
  return 1
}
# Build the forward and reverse grapheme break DFAs.
#
# The regex is whatever "$D/regex/grapheme.sh" prints; both ucd-generate runs
# receive it as their final argument, right after src/unicode/fsm/.
graphemes() {
  pattern="$(sh "$D/regex/grapheme.sh")"

  # Arguments common to both directions. The DFA name (and, for the reverse
  # DFA, --reverse --longest) is put in front of these at each call.
  set -- \
    --sparse --minimize --anchored --state-size 2 \
    src/unicode/fsm/ \
    "$pattern"

  echo "generating forward grapheme DFA"
  ucd-generate dfa --name GRAPHEME_BREAK_FWD "$@"

  echo "generating reverse grapheme DFA"
  ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest "$@"
}
# Build the forward word break DFA.
#
# The regex is whatever "$D/regex/word.sh" prints; it is handed to
# ucd-generate as the final argument.
words() {
  pattern="$(sh "$D/regex/word.sh")"
  echo "generating forward word DFA (this can take a while)"

  # Assemble the full argument list first, then invoke the tool once.
  set -- \
    dfa --name WORD_BREAK_FWD \
    --sparse --minimize --anchored --state-size 4 \
    src/unicode/fsm/ \
    "$pattern"
  ucd-generate "$@"
}
# Build the forward sentence break DFA.
#
# The regex is whatever "$D/regex/sentence.sh" prints; it is handed to
# ucd-generate as the final argument.
sentences() {
  pattern="$(sh "$D/regex/sentence.sh")"
  echo "generating forward sentence DFA (this can take a while)"

  # Assemble the full argument list first, then invoke the tool once.
  set -- \
    dfa --name SENTENCE_BREAK_FWD \
    --minimize --sparse --anchored --state-size 4 \
    src/unicode/fsm/ \
    "$pattern"
  ucd-generate "$@"
}
# Build the reverse regional indicator DFA.
#
# The DFA matches \p{gcb=Regional_Indicator} and is for finding all
# occurrences of regional indicators, which the reverse grapheme iterator and
# the reverse word iterator handle as a special case.
regional_indicator() {
  echo "generating regional indicator DFA"

  # Assemble the full argument list first, then invoke the tool once.
  set -- \
    dfa --name REGIONAL_INDICATOR_REV --reverse \
    --classes --minimize --anchored --premultiply --state-size 1 \
    src/unicode/fsm/ \
    '\p{gcb=Regional_Indicator}'
  ucd-generate "$@"
}
# Build the forward simple word DFA, which matches the regex \w.
simple_word() {
  word_class='\w'
  echo "generating forward simple word DFA"
  ucd-generate dfa --name SIMPLE_WORD_FWD \
    --sparse --minimize --state-size 2 \
    src/unicode/fsm/ "$word_class"
}
# Build the forward and reverse anchored whitespace DFAs for the regex \s+.
#
# The two runs differ in name, in the extra --reverse flag and in the value
# given to --state-size (1 forward, 2 reverse).
whitespace() {
  # Flags shared by both directions.
  set -- --anchored --classes --premultiply --minimize

  echo "generating forward whitespace DFA"
  ucd-generate dfa --name WHITESPACE_ANCHORED_FWD \
    "$@" --state-size 1 src/unicode/fsm/ '\s+'

  echo "generating reverse whitespace DFA"
  ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse \
    "$@" --state-size 2 src/unicode/fsm/ '\s+'
}
# Parse the command line and run the requested generators.
#
# Arguments ("$@"):
#   -h, --help       print a usage line to stderr and exit
#   --list-commands  print every known command, one per line, and exit
#   <command> ...    commands to run, in the order given; with no arguments,
#                    or when "all" is among them, every known command runs
#
# Returns 0 when every requested command was recognized. Returns 1 when at
# least one was not; the recognized ones still run, and each unknown name is
# reported on stderr. Exits through `requires` if ucd-generate is missing.
main() {
  if array_exists "-h" "$@" || array_exists "--help" "$@"; then
    echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
    exit
  fi

  # Known commands. Each maps to a function of the same name with "-"
  # replaced by "_". Only ever expanded unquoted, so layout is irrelevant.
  commands="
    graphemes
    sentences
    words
    regional-indicator
    simple-word
    whitespace
  "
  if array_exists "--list-commands" "$@"; then
    for cmd in $commands; do
      echo "$cmd"
    done
    exit
  fi

  # ucd-generate is used to compile regexes into DFAs.
  requires ucd-generate
  mkdir -p src/unicode/fsm/

  cmds=$*
  if [ $# -eq 0 ] || array_exists "all" "$@"; then
    cmds=$commands
  fi

  # Remember whether anything was rejected so the exit status can reflect it
  # instead of reporting success after a mistyped command name.
  rc=0
  for cmd in $cmds; do
    if array_exists "$cmd" $commands; then
      fun="$(echo "$cmd" | sed 's/-/_/g')"
      # $cmd was just validated against the known list, so the derived name
      # can be invoked directly; no eval is needed.
      "$fun"
    else
      echo "unrecognized command: $cmd" >&2
      rc=1
    fi
  done
  return "$rc"
}
# Run main with the script's command-line arguments.
main "$@"