From ea0f913dfbc9549f1ea2dd20f11470c423ac3aeb Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Mon, 22 Jan 2024 04:29:49 +0100 Subject: [PATCH] [Template merge] src/fst reorg --- .gitignore | 26 +- m4/giella-config-files.m4 | 19 +- m4/giella-macros.m4 | 10 +- src/filters/Makefile.am.orig | 69 - src/fst/Makefile.am | 1282 +++++++++++++--- src/{ => fst}/filters/.gitignore | 0 src/{ => fst}/filters/Makefile.am | 18 +- .../filters/block-compounds.est.xfscript | 0 .../filters/block-derivations.est.xfscript | 0 ...wncase-derived_proper-strings.est.xfscript | 0 .../filters/downcase_UCletters.regex | 0 .../filters/evaluate-flags.est.xfscript | 0 .../filters/modify-derivations.est.xfscript | 0 .../filters/numeral-filter.est.xfscript | 0 src/{ => fst}/filters/remove-DNorm-tags.regex | 0 .../remove-NotNorm-wordforms.est.xfscript | 0 .../remove-derivation-position-tags.regex | 0 .../filters/remove-guessed-forms.est.xfscript | 0 .../filters/remove-non-gi-forms.est.xfscript | 0 .../filters/remove-norm-comp-tags.regex | 0 .../filters/remove-nospell-words.est.xfscript | 0 .../filters/remove-pl-forms.est.xfscript | 0 .../filters/remove-sg-forms.est.xfscript | 0 .../filters/remove-sg-nom-forms.est.xfscript | 0 .../filters/remove-usage-tags.est.xfscript | 0 .../filters/rename-POS_before_Der-tags.regex | 0 .../filters/reorder-tags.est.xfscript | 0 .../filters/upcase-guessed-names.est.xfscript | 0 ...upcase-guessed_proper-strings.est.xfscript | 0 .../filters/wordpair-filter.est.xfscript | 0 src/fst/morphology/Makefile.am | 202 +++ .../Makefile.modifications-local.am | 20 + .../morphology/Makefile.modifications-phon.am | 26 + .../affixes/exceptional_declinations.lexc | 0 src/fst/{ => morphology}/affixes/gi.lexc | 0 .../affixes/regular_declinations.lexc | 0 src/fst/{ => morphology}/affixes/verbs.lexc | 0 .../generated_files/00README.txt | 0 .../{ => morphology}/incoming/00README.txt | 0 src/fst/{ => morphology}/phonology.twolc | 0 src/fst/{ => morphology}/root.lexc | 0 .../{ => 
morphology}/stems/abbreviations.lexc | 0 src/fst/{ => morphology}/stems/acronyms.lexc | 0 .../{ => morphology}/stems/adjectives.lexc | 0 .../{ => morphology}/stems/adpositions.lexc | 0 src/fst/{ => morphology}/stems/adverbs.lexc | 0 .../stems/cardinalnumerals.lexc | 0 .../stems/comparative_adjectives.lexc | 0 .../{ => morphology}/stems/conjunctions.lexc | 0 .../stems/final_components.lexc | 0 .../stems/genitive_attributes.lexc | 0 .../{ => morphology}/stems/interjections.lexc | 0 .../stems/noninflecting_adjectives.lexc | 0 .../stems/noninflecting_verbs.lexc | 0 src/fst/{ => morphology}/stems/nouns.lexc | 0 src/fst/{ => morphology}/stems/numbers.lexc | 0 .../stems/ordinalnumerals.lexc | 0 src/fst/{ => morphology}/stems/prefixes.lexc | 0 src/fst/{ => morphology}/stems/pronouns.lexc | 0 .../{ => morphology}/stems/propernouns.lexc | 0 .../stems/superlative_adjectives.lexc | 0 .../stems/symbol_strings.lexc | 0 src/fst/{ => morphology}/stems/verbs.lexc | 0 src/{ => fst}/orthography/Makefile.am | 3 +- src/{ => fst}/orthography/allcaps.xfscript | 0 .../downcase-derived_proper-strings.xfscript | 0 src/{ => fst}/orthography/inituppercase.regex | 0 src/{ => fst}/orthography/punctrelax.xfscript | 0 .../spellrelax-mobile-keyboard.regex | 0 .../orthography/spellrelax-tags.regex | 0 .../orthography/spellrelax-with-tags.xfscript | 0 src/{ => fst}/orthography/spellrelax.regex | 0 src/{ => fst}/phonetics/Makefile.am | 0 src/{ => fst}/phonetics/tests/Makefile.am | 0 src/{ => fst}/phonetics/tests/run_tests.sh.in | 0 src/fst/phonetics/tests/tests/Makefile.am | 19 + src/fst/phonetics/tests/tests/run_tests.sh.in | 89 ++ src/{ => fst}/phonetics/txt2ipa.xfscript | 0 .../syllabification}/Makefile.am | 0 .../syllabification}/hyphenation.xfscript | 0 src/{ => fst}/tagsets/Makefile.am | 0 src/{ => fst}/transcriptions/Makefile.am | 0 .../transcriptor-abbrevs2text.lexc | 0 .../transcriptor-clock-digit2text.lexc | 0 .../transcriptor-date-digit2text.lexc | 0 
.../transcriptor-numbers-digit2text.lexc | 0 .../generate-propernoun-lemmas.sh.in | 2 +- .../desktop/hfst/accept-all-lemmas.sh.in | 2 +- tools/spellcheckers/soovita.cpp.utf8.1 | 1302 ++++++++--------- 89 files changed, 2129 insertions(+), 960 deletions(-) delete mode 100644 src/filters/Makefile.am.orig rename src/{ => fst}/filters/.gitignore (100%) rename src/{ => fst}/filters/Makefile.am (64%) rename src/{ => fst}/filters/block-compounds.est.xfscript (100%) rename src/{ => fst}/filters/block-derivations.est.xfscript (100%) rename src/{ => fst}/filters/downcase-derived_proper-strings.est.xfscript (100%) rename src/{ => fst}/filters/downcase_UCletters.regex (100%) rename src/{ => fst}/filters/evaluate-flags.est.xfscript (100%) rename src/{ => fst}/filters/modify-derivations.est.xfscript (100%) rename src/{ => fst}/filters/numeral-filter.est.xfscript (100%) rename src/{ => fst}/filters/remove-DNorm-tags.regex (100%) rename src/{ => fst}/filters/remove-NotNorm-wordforms.est.xfscript (100%) rename src/{ => fst}/filters/remove-derivation-position-tags.regex (100%) rename src/{ => fst}/filters/remove-guessed-forms.est.xfscript (100%) rename src/{ => fst}/filters/remove-non-gi-forms.est.xfscript (100%) rename src/{ => fst}/filters/remove-norm-comp-tags.regex (100%) rename src/{ => fst}/filters/remove-nospell-words.est.xfscript (100%) rename src/{ => fst}/filters/remove-pl-forms.est.xfscript (100%) rename src/{ => fst}/filters/remove-sg-forms.est.xfscript (100%) rename src/{ => fst}/filters/remove-sg-nom-forms.est.xfscript (100%) rename src/{ => fst}/filters/remove-usage-tags.est.xfscript (100%) rename src/{ => fst}/filters/rename-POS_before_Der-tags.regex (100%) rename src/{ => fst}/filters/reorder-tags.est.xfscript (100%) rename src/{ => fst}/filters/upcase-guessed-names.est.xfscript (100%) rename src/{ => fst}/filters/upcase-guessed_proper-strings.est.xfscript (100%) rename src/{ => fst}/filters/wordpair-filter.est.xfscript (100%) create mode 100644 
src/fst/morphology/Makefile.am create mode 100644 src/fst/morphology/Makefile.modifications-local.am create mode 100644 src/fst/morphology/Makefile.modifications-phon.am rename src/fst/{ => morphology}/affixes/exceptional_declinations.lexc (100%) rename src/fst/{ => morphology}/affixes/gi.lexc (100%) rename src/fst/{ => morphology}/affixes/regular_declinations.lexc (100%) rename src/fst/{ => morphology}/affixes/verbs.lexc (100%) rename src/fst/{ => morphology}/generated_files/00README.txt (100%) rename src/fst/{ => morphology}/incoming/00README.txt (100%) rename src/fst/{ => morphology}/phonology.twolc (100%) rename src/fst/{ => morphology}/root.lexc (100%) rename src/fst/{ => morphology}/stems/abbreviations.lexc (100%) rename src/fst/{ => morphology}/stems/acronyms.lexc (100%) rename src/fst/{ => morphology}/stems/adjectives.lexc (100%) rename src/fst/{ => morphology}/stems/adpositions.lexc (100%) rename src/fst/{ => morphology}/stems/adverbs.lexc (100%) rename src/fst/{ => morphology}/stems/cardinalnumerals.lexc (100%) rename src/fst/{ => morphology}/stems/comparative_adjectives.lexc (100%) rename src/fst/{ => morphology}/stems/conjunctions.lexc (100%) rename src/fst/{ => morphology}/stems/final_components.lexc (100%) rename src/fst/{ => morphology}/stems/genitive_attributes.lexc (100%) rename src/fst/{ => morphology}/stems/interjections.lexc (100%) rename src/fst/{ => morphology}/stems/noninflecting_adjectives.lexc (100%) rename src/fst/{ => morphology}/stems/noninflecting_verbs.lexc (100%) rename src/fst/{ => morphology}/stems/nouns.lexc (100%) rename src/fst/{ => morphology}/stems/numbers.lexc (100%) rename src/fst/{ => morphology}/stems/ordinalnumerals.lexc (100%) rename src/fst/{ => morphology}/stems/prefixes.lexc (100%) rename src/fst/{ => morphology}/stems/pronouns.lexc (100%) rename src/fst/{ => morphology}/stems/propernouns.lexc (100%) rename src/fst/{ => morphology}/stems/superlative_adjectives.lexc (100%) rename src/fst/{ => 
morphology}/stems/symbol_strings.lexc (100%) rename src/fst/{ => morphology}/stems/verbs.lexc (100%) rename src/{ => fst}/orthography/Makefile.am (93%) rename src/{ => fst}/orthography/allcaps.xfscript (100%) rename src/{ => fst}/orthography/downcase-derived_proper-strings.xfscript (100%) rename src/{ => fst}/orthography/inituppercase.regex (100%) rename src/{ => fst}/orthography/punctrelax.xfscript (100%) rename src/{ => fst}/orthography/spellrelax-mobile-keyboard.regex (100%) rename src/{ => fst}/orthography/spellrelax-tags.regex (100%) rename src/{ => fst}/orthography/spellrelax-with-tags.xfscript (100%) rename src/{ => fst}/orthography/spellrelax.regex (100%) rename src/{ => fst}/phonetics/Makefile.am (100%) rename src/{ => fst}/phonetics/tests/Makefile.am (100%) rename src/{ => fst}/phonetics/tests/run_tests.sh.in (100%) create mode 100644 src/fst/phonetics/tests/tests/Makefile.am create mode 100644 src/fst/phonetics/tests/tests/run_tests.sh.in rename src/{ => fst}/phonetics/txt2ipa.xfscript (100%) rename src/{hyphenation => fst/syllabification}/Makefile.am (100%) rename src/{hyphenation => fst/syllabification}/hyphenation.xfscript (100%) rename src/{ => fst}/tagsets/Makefile.am (100%) rename src/{ => fst}/transcriptions/Makefile.am (100%) rename src/{ => fst}/transcriptions/transcriptor-abbrevs2text.lexc (100%) rename src/{ => fst}/transcriptions/transcriptor-clock-digit2text.lexc (100%) rename src/{ => fst}/transcriptions/transcriptor-date-digit2text.lexc (100%) rename src/{ => fst}/transcriptions/transcriptor-numbers-digit2text.lexc (100%) diff --git a/.gitignore b/.gitignore index b2355070..b3d95092 100644 --- a/.gitignore +++ b/.gitignore @@ -61,19 +61,19 @@ /src/cg3/functions.cg3 /src/cg3/generated-tag-list.cg3 /src/cg3/valency-postspell.cg3 -/src/filters/*-tags.txt -/src/filters/*area-*.regex -/src/filters/remove-all*.regex -/src/filters/remove-homonymy-tags.regex -/src/filters/remove-usage-tags.regex -/src/fst/*-error-log.txt -/src/fst/*.tmp.* 
+/src/fst/filters/*-tags.txt +/src/fst/filters/*area-*.regex +/src/fst/filters/remove-all*.regex +/src/fst/filters/remove-homonymy-tags.regex +/src/fst/filters/remove-usage-tags.regex +/src/fst/morphology/*-error-log.txt +/src/fst/morphology/*.tmp.* /src/fst/generated_files/*.lexc -/src/fst/lexicon* -/src/fst/url.lexc -/src/orthography/*-nfc2nfd.* -/src/orthography/*-nfd2nfc.* -/src/phonetics/tests/*.sh +/src/fst/morphology/lexicon* +/src/fst/morphology/url.lexc +/src/fst/orthography/*-nfc2nfd.* +/src/fst/orthography/*-nfd2nfc.* +/src/fst/phonetics/tests/*.sh /test/run-morph-tester.sh /test/run-yaml-testcases.sh /test/src/morphology/all*.txt @@ -148,3 +148,5 @@ Makefile.in build bygg generated* +.deps +.generated diff --git a/m4/giella-config-files.m4 b/m4/giella-config-files.m4 index c9670e2b..8745b665 100644 --- a/m4/giella-config-files.m4 +++ b/m4/giella-config-files.m4 @@ -8,15 +8,16 @@ AC_CONFIG_FILES([Makefile \ giella-est.pc \ manifest.toml \ src/Makefile \ - src/filters/Makefile \ - src/hyphenation/Makefile \ + src/fst/filters/Makefile \ + src/fst/syllabification/Makefile \ src/fst/Makefile \ - src/orthography/Makefile \ - src/phonetics/Makefile \ - src/phonetics/tests/Makefile \ + src/fst/morphology/Makefile \ + src/fst/orthography/Makefile \ + src/fst/phonetics/Makefile \ + src/fst/phonetics/tests/Makefile \ src/cg3/Makefile \ - src/tagsets/Makefile \ - src/transcriptions/Makefile \ + src/fst/tagsets/Makefile \ + src/fst/transcriptions/Makefile \ docs/Makefile \ test/Makefile \ test/tools/Makefile \ @@ -64,8 +65,8 @@ AC_CONFIG_FILES([Makefile \ # Add one AC_CONFIG_FILES for each script file that needs processing. This gives # the most pleasant user experience and most readable autoconf code to maintain. 
# Spell checker tests, all languages: -AC_CONFIG_FILES([src/phonetics/tests/run_tests.sh], - [chmod a+x src/phonetics/tests/run_tests.sh]) +AC_CONFIG_FILES([src/fst/phonetics/tests/run_tests.sh], + [chmod a+x src/fst/phonetics/tests/run_tests.sh]) AC_CONFIG_FILES([test/tools/spellcheckers/test-zhfst-file.sh], \ [chmod a+x test/tools/spellcheckers/test-zhfst-file.sh]) AC_CONFIG_FILES([test/tools/spellcheckers/fstbased/desktop/hfst/test-zhfst-basic-sugg-speed.sh], \ diff --git a/m4/giella-macros.m4 b/m4/giella-macros.m4 index 2d7500c5..99092492 100644 --- a/m4/giella-macros.m4 +++ b/m4/giella-macros.m4 @@ -88,7 +88,7 @@ AC_MSG_RESULT([$GIELLA_CORE]) ############################################################### ### This is the version of the Giella Core that we require. ### ### UPDATE AS NEEDED. -_giella_core_min_version=0.20.1 +_giella_core_min_version=0.21.0 # GIELLA_CORE/GTCORE env. variable, required by the infrastructure to find scripts: AC_ARG_VAR([GIELLA_CORE], [directory for the Giella infra core scripts and other required resources]) @@ -845,9 +845,9 @@ AC_ARG_ENABLE([abbr], [enable_abbr=$enableval], [enable_abbr=no]) AS_IF([test x$enable_abbr != xno -a \ - "$(find ${srcdir}/src/fst/stems/ -name "abbreviations.lexc" | head -n 1)" = "" ], + "$(find ${srcdir}/src/fst/morphology/stems/ -name "abbreviations.lexc" | head -n 1)" = "" ], [AC_MSG_ERROR([You asked for abbr.txt generation, but have no file \ -src/fst/stems/abbreviations.lexc])]) +src/fst/morphology/stems/abbreviations.lexc])]) AS_IF([test x$enable_abbr = xyes -a x$enable_generators = xno], [AC_MSG_ERROR([You need to enable generators to build the abbr file])]) AM_CONDITIONAL([WANT_ABBR], [test "x$enable_abbr" != xno]) @@ -1002,7 +1002,7 @@ To build, test and install: make install EOF AS_IF([test x$gt_prog_xslt = xno -a \ - "$(find ${srcdir}/src/fst/stems -name "*.xml" | head -n 1)" != "" ], + "$(find ${srcdir}/src/fst/morphology/stems -name "*.xml" | head -n 1)" != "" ], [AC_MSG_WARN([You have XML
source files, but XML transformation to LexC is disabled. Please check the output of configure to locate any problems. The LexC files will still compile though. @@ -1048,5 +1048,7 @@ cd .. git clone git@github.com:giellalt/$gt_SHARED_FAILS cd $gt_SHARED_FAILS ./autogen.sh && ./configure && make])]) +AC_MSG_WARN([January 2024: the lexc files and fsts have been moved up to src/fst/morphology]) ]) # gt_PRINT_FOOTER + # vim: set ft=config: diff --git a/src/filters/Makefile.am.orig b/src/filters/Makefile.am.orig deleted file mode 100644 index 2eb6ed77..00000000 --- a/src/filters/Makefile.am.orig +++ /dev/null @@ -1,69 +0,0 @@ -## Process this file with automake to produce Makefile.in - -## Copyright (C) 2011 Samediggi - -## This program is free software: you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation, either version 3 of the License, or -## (at your option) any later version. - -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. - -## You should have received a copy of the GNU General Public License -## along with this program. If not, see . 
- -######################################################### -############## BEGIN: Local modifications ############### - -# List any local filter regex files here: -GIELLA_FILTER_LOCAL_REGEX_SRCS=\ - downcase_UCletters.regex - -# List any local filter xfscript files here: -GIELLA_FILTER_LOCAL_XFSCRIPT_SRCS=reorder-tags.est.xfscript \ - remove-sg-forms.est.xfscript \ - remove-pl-forms.est.xfscript \ - remove-sg-nom-forms.est.xfscript \ - remove-non-gi-forms.est.xfscript \ - remove-usage-tags.est.xfscript \ - remove-nospell-words.est.xfscript \ - remove-NotNorm-wordforms.est.xfscript \ - modify-derivations.est.xfscript \ - block-derivations.est.xfscript \ - block-compounds.est.xfscript \ - wordpair-filter.est.xfscript \ - numeral-filter.est.xfscript \ - evaluate-flags.est.xfscript \ - downcase-derived_proper-strings.est.xfscript \ - upcase-guessed-names.est.xfscript \ - remove-guessed-forms.est.xfscript - -# List any local filter lexc files here: -GIELLA_FILTER_LOCAL_LEXC_SRCS= - -# List any locally generated regex source files here: -GIELLA_FILTER_LOCAL_GENERATED_REGEX_SRCS= - -# List any locally generated xfscript source files here: -GIELLA_FILTER_LOCAL_GENERATED_XFSCRIPT_SRCS= - -# List any locally generated lexc source files here: -GIELLA_FILTER_LOCAL_GENERATED_LEXC_SRCS= - -# List any additional source files here, so that they are included in the dist. -# Source files that are not directly compiled to fst's but are instead used as -# part of a local build step should be listed here. 
-EXTRA_SRCS= - -########## Add local build rules below here: ############ - -############### END: Local modifications ################ -######################################################### - -# Included build file, where the actual build instructions are: -include $(top_srcdir)/../giella-core/am-shared/src-filters-dir-include.am - -# vim: set ft=automake: diff --git a/src/fst/Makefile.am b/src/fst/Makefile.am index f0f39a5b..d599b796 100644 --- a/src/fst/Makefile.am +++ b/src/fst/Makefile.am @@ -1,197 +1,1091 @@ ## Process this file with automake to produce Makefile.in +## Copyright: Sámediggi/Divvun/UiT +## Licence: GPL v3+ -## Copyright (C) 2011 Samediggi - -## This program is free software: you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation, either version 3 of the License, or -## (at your option) any later version. - -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. - -## You should have received a copy of the GNU General Public License -## along with this program. If not, see . - -# Add language-specific flags for hfst-lexc compilation here: -if HAVE_SHARED_COMMON -HFST_LEXC_LOCAL_FLAGS= # --Werror # uncomment if lexc is good enough -else -HFST_LEXC_LOCAL_FLAGS= # No --Werror if deps are missing ! 
-endif - - -####### Morphology source file defs: ######## - -# Set this to name of lexc file containing Multichar_Symbols and LEXICON Root -GT_LEXC_ROOT=$(srcdir)/root.lexc - -# Set this to the names of all regular lexc source files: -GT_LEXC_SRCS_L1_L2=\ - stems/abbreviations.lexc \ - stems/adjectives.lexc \ - stems/noninflecting_adjectives.lexc \ - stems/comparative_adjectives.lexc \ - stems/superlative_adjectives.lexc \ - stems/adpositions.lexc \ - stems/adverbs.lexc \ - stems/conjunctions.lexc \ - stems/genitive_attributes.lexc \ - stems/interjections.lexc \ - stems/nouns.lexc \ - stems/cardinalnumerals.lexc \ - stems/ordinalnumerals.lexc \ - stems/pronouns.lexc \ - stems/propernouns.lexc \ - stems/verbs.lexc \ - stems/noninflecting_verbs.lexc \ - stems/prefixes.lexc \ - stems/final_components.lexc \ - stems/numbers.lexc \ - stems/acronyms.lexc \ - stems/symbol_strings.lexc \ - affixes/regular_declinations.lexc \ - affixes/exceptional_declinations.lexc \ - affixes/verbs.lexc \ - affixes/gi.lexc - - -# If you are building an error-detecting L2 analyser, specify the lexc files -# that differ between the regular L1 and the L2 analysers below, in L1 and -# L2 respectively. L2 files must end in "*-L2.lexc". See SME for an example. 
-L1= - -L2= - -GT_LEXC_SRCS=\ - $(GT_LEXC_SRCS_L1_L2) \ - $(L1) - -GT_LEXC_L2_SRCS=\ - $(GT_LEXC_SRCS_L1_L2) \ - $(L2) - -# Set this to the names of all generated lexc files, if any -GENERATED_LEXC_SRCS=generated_files/mul-$(GLANG)-punctuation.lexc \ - generated_files/mul-$(GLANG)-symbols.lexc - -# change handling of shared lexical data here: -if HAVE_SHARED_COMMON -url.tmp.lexc: $(gt_SHARED_common)/src/fst/url.lexc - $(AM_V_CP)cp -f $< $@ - -generated_files/mul-$(GLANG)-%.lexc: $(gt_SHARED_common)/src/fst/stems/%.lexc - $(AM_V_at)$(MKDIR_P) generated_files - $(AM_V_CP)cp -f $< $@ -else -# this is "safe" fallback (compiles but you miss everything) -url.tmp.lexc: - echo "LEXICON Root" > $@ - echo "< h t t p (s) %: %/ %/ ?*> # ;" >> $@ - -generated_files/mul-$(GLANG)-%.lexc: - $(AM_V_at)$(MKDIR_P) generated_files - echo "! Missing shared common data" > $@ -endif -# add other lexical shared data handling here - -# Set this to the names of all source xml files, if any -GT_XML_SRCS= - -# Define any additional lexc sources here (compiled on their own): -GT_LOCAL_SRCS=\ - pair_initial.tmp.lexc \ - pair_final.tmp.lexc \ - num_initial.tmp.lexc \ - num_final.tmp.lexc \ - abbrevdot.tmp.lexc \ - guesser-simplex-nouns.tmp.lexc \ - guesser-names.tmp0.lexc - -# guesser-derivations.tmp.lexc -# punctuation.tmp.lexc -# acronyms.tmp.lexc - -# Define local xfscripts here: -GT_LOCAL_XFSCRIPT_SRCS=\ - emoticon.xfscript - -# Define here any additional sources just included in the distro: -GT_DISTRO_SRCS= - -### BEGIN: Local processing: ### -EST_AFFIX_FILES=$(srcdir)/affixes/regular_declinations \ - $(srcdir)/affixes/exceptional_declinations \ - $(srcdir)/affixes/verbs \ - $(srcdir)/affixes/gi - -# make the parts inflect -# by re-using root.lexc and affixes, and omitting all the stem lexicons -# ... and remove the flag diacritics from initial parts (why? because otherwise the filter that -# puts the initial and final part together doesn't work ?) +# always build . 
last here, and tagsets have to be built after morphology +SUBDIRS = morphology filters phonetics syllabification orthography transcriptions tagsets . + +####### Automake targets: ######## + +# Define target variables first, before assigning to them: +GT_ANALYSERS= +GT_GENERATORS= +CUSTOM_FSTS= + +#### Local modifications in *fst processing: #### +#### +#### Copy the fallback targets, and rename them to the desired targets. Then: +#### Replace the 'cp' command (Xerox) / Prepend the hfst-invert command (Hfst - +#### remember to move the $<) with whatever you need to complete +#### the processing to get the final target transducer. +#### Remember to add the dependencies as well. +#### Also make sure that HFST and Xerox processing are the same. +#### +#### If you add new transducers to be built, you need to add them to the +#### relevant variable, e.g.: +#### +#### if CAN_HFST +#### GT_GENERATORS+=generator-oahpa-gt-norm.hfst +#### endif +#### +#### NB!!!! The HFST targets should get a hyphen after 'analyser'/'generator' +#### respectively, to make the local targets minimally different from and +#### slightly more specific than the fallback targets. This is to avoid warnings +#### about duplicate targets. 
That is, the local targets should look like: +#### +#### analyser-%.hfst: analyser-%.tmp.hfst +#### generator-%.hfst: generator-%.tmp.hfst + +################################################################## +#### BEGIN: Add local processing instructions BELOW this line #### +################################################################## + +######################################################## +#### Add language-specific transducer targets here: #### + +#### Xerox transducers: +if CAN_XFST +GT_ANALYSERS+=analyser-gt-desc.xfst \ + analyser-gt-norm.xfst \ + analyser-disamb-gt-desc.xfst +GT_GENERATORS+=generator-gt-desc.xfst \ + generator-gt-norm.xfst + +if WANT_CUSTOM_FSTS +CUSTOM_FSTS+= +endif # WANT_CUSTOM_FSTS + +endif # CAN_XFST + +#### HFST transducers +if CAN_HFST +GT_ANALYSERS+=analyser-gt-desc.hfst \ + analyser-gt-norm.hfst \ + analyser-gt-desc.hfst \ + analyser-gt-descguess.hfst \ + analyser-gt-guess.hfst \ + analyser-disamb-gt-desc.hfst +GT_GENERATORS+=generator-gt-desc.hfst \ + generator-gt-norm.hfst \ + generator-gt-desc.hfst \ + generator-gt-descguess.hfst \ + generator-gt-guess.hfst + +if WANT_CUSTOM_FSTS +CUSTOM_FSTS+= +endif # WANT_CUSTOM_FSTS + +endif # CAN_HFST + +#### FOMA transducers +if CAN_FOMA +GT_ANALYSERS+= +GT_GENERATORS+= + +if WANT_CUSTOM_FSTS +CUSTOM_FSTS+= +endif # WANT_CUSTOM_FSTS + +endif # CAN_FOMA + +################################################# +#### Add language-specific build rules here: #### + +EST_EXTRA_PRE_FILTERS=filters/remove-sg-forms.est filters/remove-pl-forms.est filters/remove-sg-nom-forms.est filters/remove-non-gi-forms.est filters/block-derivations.est # filters/remove-usage-tags.est +#EST_GUESSER_PRE_FILTERS=filters/block-guesser-derivations.est +EST_EXTRA_POST_FILTERS=filters/modify-derivations.est # filters/downcase-derived_proper-strings.est +EST_WORDPAIR_FILTERS=filters/reorder-tags.est filters/wordpair-filter.est +EST_NUMERAL_FILTERS=filters/reorder-tags.est filters/numeral-filter.est
+#EST_COMPOUND_PRE_FILTERS=filters/block-compounds.est +#EST_COMPOUND_POST_FILTERS=filters/compound-filter.est + +# paired words are words whose both parts inflect, e.g. emb-kumb, kihin-kahin +# they are compiled into a transducer of its own: +# initial_part final_part +# this transducer is union-ed with the transducer of the rest of vocabulary (?) + +# paired words: initial part +# HFST: +pair_initial.tmp1.hfst: fst/pair_initial.tmp.hfst \ + fst/phonology.compose.hfst + $(AM_V_INTRSCT)\ + $(HFST_DETERMINIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) $<\ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + | $(HFST_COMPOSE_INTERSECT) $(COMPOSE_INTERSECT_FLAG) \ + $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -2 fst/phonology.compose.hfst \ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -o $@ + +# XEROX +pair_initial.tmp1.xfst: fst/pair_initial.tmp.xfst \ + fst/phonology.compose.xfst + $(AM_V_LEXC)$(PRINTF) \ + "read-source fst/pair_initial.tmp.xfst\n\ + read-rules fst/phonology.compose.xfst\n\ + compose-result\n\ + save-result $@\n\ + quit\n" \ + | $(LEXC) $(VERBOSITY) + +# compound numerals where both parts inflect, e.g.
viis#sada, viie#saja +# they are compiled into a transducer of its own +# this transducer is union-ed with the transducer of the simplex words + +# compound numerals: initial part +# HFST: +num_initial.tmp1.hfst: fst/num_initial.tmp.hfst \ + fst/phonology.compose.hfst + $(AM_V_INTRSCT)\ + $(HFST_DETERMINIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) $<\ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + | $(HFST_COMPOSE_INTERSECT) $(COMPOSE_INTERSECT_FLAG) \ + $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -2 fst/phonology.compose.hfst \ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -o $@ + +# XEROX +num_initial.tmp1.xfst: fst/num_initial.tmp.xfst \ + fst/phonology.compose.xfst + $(AM_V_LEXC)$(PRINTF) \ + "read-source fst/num_initial.tmp.xfst\n\ + read-rules fst/phonology.compose.xfst\n\ + compose-result\nsave-result $@\n\ + quit\n" \ + | $(LEXC) $(VERBOSITY) + + +# paired words: final part +# HFST: +pair_final.tmp1.hfst: fst/pair_final.tmp.hfst \ + fst/phonology.compose.hfst + $(AM_V_INTRSCT)\ + $(HFST_DETERMINIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) $<\ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + | $(HFST_COMPOSE_INTERSECT) $(COMPOSE_INTERSECT_FLAG) \ + $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -2 fst/phonology.compose.hfst \ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -o $@ + +# XEROX +pair_final.tmp1.xfst: fst/pair_final.tmp.xfst \ + fst/phonology.compose.xfst + $(AM_V_LEXC)$(PRINTF) \ + "read-source fst/pair_final.tmp.xfst\n\ + read-rules fst/phonology.compose.xfst\n\ + compose-result\n\ + save-result $@\n\ + quit\n" \ + | $(LEXC) $(VERBOSITY) + +# compound numerals: final part +# HFST: +num_final.tmp1.hfst: fst/num_final.tmp.hfst \ + fst/phonology.compose.hfst + $(AM_V_INTRSCT)\ + $(HFST_DETERMINIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) $<\ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + | $(HFST_COMPOSE_INTERSECT) $(COMPOSE_INTERSECT_FLAG) \ + $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -2 fst/phonology.compose.hfst \ + | $(HFST_MINIMIZE) 
$(MORE_VERBOSITY) $(HFST_FLAGS) \ + -o $@ + +# XEROX +num_final.tmp1.xfst: fst/num_final.tmp.xfst \ + fst/phonology.compose.xfst + $(AM_V_LEXC)$(PRINTF) \ + "read-source fst/num_final.tmp.xfst\n\ + read-rules fst/phonology.compose.xfst\n\ + compose-result\n\ + save-result $@\n\ + quit\n" \ + | $(LEXC) $(VERBOSITY) + +# guesser + +# guesser for simplex words +# create draft "phonological" name patterns: +# select only nouns, tag them as proper nouns, +# and upcase both the lexical and the surface side +# (perhaps this could be done more elegantly, i.e. in some other dir and/or makefile) +fst/guesser-names.tmp.hfst: fst/guesser-names.tmp0.hfst filters/upcase-guessed-names.est.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon OFF\n\ + read regex \ + [ \"+Guess\" \"+N\" \"+Prop\" <- \"+Guess\" \"+N\" ] \ + .o. \$$[\"+Guess\" \"+N\"] \ + .o. @\"filters/upcase-guessed-names.est.hfst\".i \ + .o. @\"$<\" \ + .o. @\"filters/upcase-guessed-names.est.hfst\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +# phonological simplex word patterns with inflections +guesser-simplex-nouns.tmp1.hfst: fst/guesser-simplex-nouns.tmp.hfst \ + fst/phonology.compose.hfst + $(AM_V_INTRSCT)\ + $(HFST_DETERMINIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) $<\ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + | $(HFST_COMPOSE_INTERSECT) $(COMPOSE_INTERSECT_FLAG) \ + $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -2 fst/phonology.compose.hfst \ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -o $@ + +# phonological name patterns with inflections +guesser-names.tmp1.hfst: fst/guesser-names.tmp.hfst \ + fst/phonology.compose.hfst + $(AM_V_INTRSCT)\ + $(HFST_DETERMINIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) $<\ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + | $(HFST_COMPOSE_INTERSECT) $(COMPOSE_INTERSECT_FLAG) \ + $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -2 fst/phonology.compose.hfst \ + | $(HFST_MINIMIZE) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -o $@ + + +# XEROX +# not implemented... 
+ +# acronyms +# acronyms.tmp1.%: fst/acronyms.tmp.% +# cp $< $@ + +# FOMA +# not implemented... + +# HFST: generator +# Xerox & FOMA: analyser +# (with a language-specific tag reordering script applied) +pair_initial.tmp.%: pair_initial.tmp1.% \ + filters/reorder-tags.$(GTLANG).% \ + filters/reorder-semantic-tags.% \ + filters/reorder-subpos-tags.% \ + filters/remove-mwe-tags.% + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/reorder-tags.$(GTLANG).$*\"\ + .o. @\"filters/reorder-subpos-tags.$*\" \ + .o. @\"filters/reorder-semantic-tags.$*\" \ + .o. @\"filters/remove-mwe-tags.$*\" \ + .o. @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +num_initial.tmp.%: num_initial.tmp1.% \ + filters/reorder-tags.$(GTLANG).% \ + filters/reorder-semantic-tags.% \ + filters/reorder-subpos-tags.% \ + filters/remove-mwe-tags.% + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/reorder-tags.$(GTLANG).$*\"\ + .o. @\"filters/reorder-subpos-tags.$*\" \ + .o. @\"filters/reorder-semantic-tags.$*\" \ + .o. @\"filters/remove-mwe-tags.$*\" \ + .o. @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +# HFST: generator +# Xerox & FOMA: analyser +# (with a language-specific tag reordering script applied) +pair_final.tmp.%: pair_final.tmp1.% \ + filters/reorder-tags.$(GTLANG).% \ + filters/reorder-semantic-tags.% \ + filters/reorder-subpos-tags.% \ + filters/remove-mwe-tags.% + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/reorder-tags.$(GTLANG).$*\"\ + .o. @\"filters/reorder-subpos-tags.$*\" \ + .o. @\"filters/reorder-semantic-tags.$*\" \ + .o. @\"filters/remove-mwe-tags.$*\" \ + .o. 
@\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +num_final.tmp.%: num_final.tmp1.% \ + filters/reorder-tags.$(GTLANG).% \ + filters/reorder-semantic-tags.% \ + filters/reorder-subpos-tags.% \ + filters/remove-mwe-tags.% + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/reorder-tags.$(GTLANG).$*\"\ + .o. @\"filters/reorder-subpos-tags.$*\" \ + .o. @\"filters/reorder-semantic-tags.$*\" \ + .o. @\"filters/remove-mwe-tags.$*\" \ + .o. @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# phonological simplex word and name patterns with inflections +# (with lexical-side tags ordered correctly) +guesser-simplex.tmp.%: guesser-simplex-nouns.tmp1.% \ + guesser-names.tmp1.% \ + filters/reorder-tags.$(GTLANG).% \ + filters/reorder-semantic-tags.% \ + filters/reorder-subpos-tags.% \ + filters/remove-mwe-tags.% + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/reorder-tags.$(GTLANG).$*\"\ + .o. @\"filters/reorder-subpos-tags.$*\" \ + .o. @\"filters/reorder-semantic-tags.$*\" \ + .o. @\"filters/remove-mwe-tags.$*\" \ + .o. [@\"$<\" | @\"guesser-names.tmp1.$*\" ]\ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + + +# HFST: generator +# Xerox & FOMA: analyser +# +# concatenate initial and final part of paired words and numerals + +redundant_wordpairs.%: pair_final.tmp.% pair_initial.tmp.% + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex [ [~[?* \"+Foc/gi\" ?*] \ + .o. @\"pair_initial.tmp.$*\"] (\"-\") [ 0:\"#\" ] @\"pair_final.tmp.$*\"] \ + .o. ~[?* » ?*] ; \nsave stack $@\nquit\n" | $(XFST_TOOL) + +redundant_numerals.%: num_final.tmp.% num_initial.tmp.% + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex [ [~[?* \"+Foc/gi\" ?*] \ + .o. @\"num_initial.tmp.$*\"] @\"num_final.tmp.$*\"] \ + .o. 
~[?* » ?*] ; \nsave stack $@\nquit\n" | $(XFST_TOOL) + +# HFST: +# filter out ungrammatical wordforms of paired words and numerals +# the result is a transducer that can be unioned with simple words lexicon to arrive at the set of simplex words and derivations + +generator-wordpairs-raw.simple.hfst: redundant_wordpairs.hfst \ + $(EST_WORDPAIR_FILTERS:%=%.hfst) + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + $(EST_WORDPAIR_FILTERS:%=@\"%.hfst\" .o.) \ + @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +generator-numerals-raw.simple.hfst: redundant_numerals.hfst \ + $(EST_NUMERAL_FILTERS:%=%.hfst) + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + $(EST_NUMERAL_FILTERS:%=@\"%.hfst\" .o.) \ + @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# guesser: +# phonological patterns of simplex words and derived words +guesser-raw.simple.hfst: guesser-simplex.tmp.hfst \ + $(EST_EXTRA_PRE_FILTERS:%=%.hfst) \ + $(EST_EXTRA_POST_FILTERS:%=%.hfst) \ + filters/downcase-derived_proper-strings.est.hfst + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/block-derivations.est.hfst\" \ + .o. [ @\"$<\"] \ + $(EST_EXTRA_POST_FILTERS:%=.o. @\"%.hfst\") \ + ;\n\ + define fst \n\ + set flag-is-epsilon OFF\n\ + read regex fst \ + .o. @\"filters/downcase-derived_proper-strings.est.hfst\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +# XEROX: +analyser-wordpairs-raw.simple.xfst: redundant_wordpairs.xfst \ + $(EST_WORDPAIR_FILTERS:%=%.xfst) + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + $(EST_WORDPAIR_FILTERS:%=@\"%.xfst\" .o.) \ + @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST) $(VERBOSITY) + +analyser-numerals-raw.simple.xfst: redundant_numerals.xfst \ + $(EST_NUMERAL_FILTERS:%=%.xfst) + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + $(EST_NUMERAL_FILTERS:%=@\"%.xfst\" .o.)
\ + @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST) $(VERBOSITY) + +# We need to add processing of language-specific tags in the analyser: +# XEROX: +# NB! cleanup net +analyser-raw-gt-desc.simple.xfst: analyser-raw-gt-desc.tmp.xfst \ + analyser-numerals-raw.simple.xfst \ + $(EST_EXTRA_PRE_FILTERS:%=%.xfst) \ + $(EST_EXTRA_POST_FILTERS:%=%.xfst) \ + filters/downcase-derived_proper-strings.est.xfst + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + $(EST_EXTRA_PRE_FILTERS:%=@\"%.xfst\" .o.) \ + [ @\"$<\" | @\"analyser-numerals-raw.simple.xfst\" ] \ + $(EST_EXTRA_POST_FILTERS:%=.o. @\"%.xfst\") \ + ;\n\ + cleanup net\n\ + define fst \n\ + set flag-is-epsilon OFF\n\ + read regex fst \ + .o. @\"filters/downcase-derived_proper-strings.est.xfst\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST) $(VERBOSITY) + +# HFST: +# 1) make a union of simple words, paired words and compound numerals +# 2) create derivations from proper names +# 3) filter out the incorrect derivations (derived from names, verbs, nouns etc) +# result: lexicon-based simplex words and derivations +# NB! includes potential compound word initial components tagged as +Guess, e.g. blabla; +# they will be legit parts of compound words, once the compound word transducer is created + +generator-raw-gt-desc.simple.weightless.hfst: generator-raw-gt-desc.tmp.hfst \ + generator-numerals-raw.simple.hfst \ + generator-wordpairs-raw.simple.hfst \ + $(EST_EXTRA_PRE_FILTERS:%=%.hfst) \ + $(EST_EXTRA_POST_FILTERS:%=%.hfst) \ + filters/downcase-derived_proper-strings.est.hfst + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + $(EST_EXTRA_PRE_FILTERS:%=@\"%.hfst\" .o.) \ + [ @\"$<\" | @\"generator-numerals-raw.simple.hfst\" \ + | @\"generator-wordpairs-raw.simple.hfst\"] \ + $(EST_EXTRA_POST_FILTERS:%=.o. @\"%.hfst\") \ + ;\n\ + define fst \n\ + set flag-is-epsilon OFF\n\ + read regex fst \ + .o. 
@\"filters/downcase-derived_proper-strings.est.hfst\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# weights added to all analyses +# result: lexicon-based simplex words and derivations with weights + +# no weight added to compound border '#' here; do it somewhere else +# 7.01.2019 from Sjur: +# Hfst - add weights to simplex words if using tropical-semiring fst format: +if WITH_OFST_TROPICAL +generator-raw-gt-desc.simple.hfst: generator-raw-gt-desc.simple.weightless.hfst + $(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -S '#' -a 0 --arcs-only -i $< \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mine' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ja' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/nu' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mus' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ng' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/v' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tav' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/nud' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mata' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/matu' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tamatu' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tu' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tud' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/lik' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/line' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ne' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/lt' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/sti' -a 10 -A \ + | 
$(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ini' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/m' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/im' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/nna' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/kond' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ist' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/is' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/us' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ti' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/lane' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/kas' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+N' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+A' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Num' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pron' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+V' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Adv' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Interj' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+CC' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+CS' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Adp' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pref' -a 5 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prop' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Card' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ord' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Comp' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) 
-S '+Superl' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Nom' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Gen' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Par' -a 2 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ill' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ine' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ela' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+All' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ade' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Abl' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Tra' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Trm' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ess' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Abe' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Com' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Impers' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pers' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prs' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prt' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ind' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Cond' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Imprt' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Quot' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg1' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg2' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg3' -a 0 -A \ 
+ | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl1' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl2' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl3' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Aff' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Neg' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sup' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Inf' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ger' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prc' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Foc/gi' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Emph' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pref' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Dim/ke' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+ABBR' -a 5 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+ACR' -a 5 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/Rare' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/Hyp' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/NotNorm' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/CommonNotNorm' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Use/Circ' -a 0 -A \ + > $@ + +# do somewhere else: +# -S '#' -a 30 --arcs-only -i \ +# | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Guess' -a 200 -A \ +# + +else !WITH_OFST_TROPICAL + +generator-raw-gt-desc.simple.hfst: generator-raw-gt-desc.simple.weightless.hfst + cp $< $@ +endif !WITH_OFST_TROPICAL + + +# HFST: +# compound words: +# 1. the non-final parts cannot be a form with a focus particle, so filter them out +# 2. 
the non-final parts may end with a hyphen (ajalooline+A+Der/minus:ajaloolis»-), which may be omitted in compounds, +# or may have a hyphen appended (for better readability of a compound) +# result: lexicon-based simplex words, derivations, paired words and numerals, compound words; everything with weights +# NB! includes words where the first part is marked as +Guess, e.g. blablawords + +generator-raw-gt-desc.comp.hfst: generator-raw-gt-desc.simple.hfst \ + filters/evaluate-flags.est.hfst + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + [ \ + [ \ + [ \ + ~[?* \"+Foc/gi\" ?*] .o. @\"$<\" \ + .o. [[ \"-\" (->) 0 || » _ .#. ] | [ [..] (->) \"-\" || \\[\"-\"] _ .#. ]] \ + ] \"#\" \ + ]* @\"$<\" \ + ] @\"filters/evaluate-flags.est.hfst\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +# XFST: +analyser-raw-gt-desc.comp.xfst: analyser-raw-gt-desc.simple.xfst \ + filters/evaluate-flags.est.xfst + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + [ \ + [ \ + [ \ + ~[?* \"+Foc/gi\" ?*] .o. @\"$<\" \ + .o. [[ \"-\" (->) 0 || » _ .#. ] | [ [..] (->) \"-\" || \\[\"-\"] _ .#. ]] \ + ] \"#\" \ + ]* @\"$<\" \ + ] @\"filters/evaluate-flags.est.xfst\" \ + ;\n\ + cleanup net\n\ + save stack $@\n\ + quit\n" | $(XFST) $(VERBOSITY) + +# the vocabulary, i.e.
words that might be combined with - / or otherwise + +# HFST: +generator-raw-gt-desc.vocabulary.hfst: generator-raw-gt-desc.comp.hfst + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex @\"$<\" ; \n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# XEROX: +analyser-raw-gt-desc.vocabulary.xfst: analyser-raw-gt-desc.comp.xfst \ + analyser-wordpairs-raw.simple.xfst + $(AM_V_XFST)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex @\"$<\" ; \n\ + read regex @\"analyser-wordpairs-raw.simple.xfst\" ; \n\ + union net\n\ + cleanup net\n\ + save stack $@\n\ + quit\n" | $(XFST) $(VERBOSITY) + +# weights added to all analyses +# 7.01.2019 from Sjur: +# Hfst - add weights to compounds if using tropical-semiring fst format: +if WITH_OFST_TROPICAL +generator-raw-gt-desc.weighted.hfst: generator-raw-gt-desc.vocabulary.hfst + $(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -S '#' -a 30 --arcs-only -i $< \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Guess' -a 200 -A \ + > $@ + +else !WITH_OFST_TROPICAL + +generator-raw-gt-desc.weighted.hfst: generator-raw-gt-desc.vocabulary.hfst + cp $< $@ +endif !WITH_OFST_TROPICAL + + +# weights added to all guessed simplex word analyses +# result: guessed simplex words and derivations with weights + +# analogy with generator-raw-gt-desc.weighted.hfst +# notice that the derived forms are weighted LESS, i.e. 
they are guessed MORE LIKELY than simplex forms +# Hfst - add weights to compounds if using tropical-semiring fst format: +if WITH_OFST_TROPICAL +guesser-raw.weighted.hfst: guesser-raw.simple.hfst + $(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) \ + -S '#' -a 30 --arcs-only -i $< \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mine' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ja' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/nu' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mus' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ng' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/v' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tav' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/nud' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/mata' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/matu' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tamatu' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tu' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/tud' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/lik' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/line' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ne' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/lt' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/sti' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ini' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/m' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/im' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) 
$(HFST_FLAGS) -S '+Der/nna' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/kond' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ist' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/is' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/us' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/ti' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/lane' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Der/kas' -a -10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+N' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+A' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Num' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pron' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+V' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Adv' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Interj' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+CC' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+CS' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Adp' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pref' -a 5 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prop' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Card' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ord' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Comp' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Superl' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Nom' -a 0 -A \ + | $(HFST_REWEIGHT)
$(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Gen' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Par' -a 2 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ill' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ine' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ela' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+All' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ade' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Abl' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Tra' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Trm' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ess' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Abe' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Com' -a 3 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Impers' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pers' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prs' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prt' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ind' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Cond' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Imprt' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Quot' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg1' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg2' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sg3' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl1' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl2' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pl3' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) 
$(HFST_FLAGS) -S '+Aff' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Neg' -a 1 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Sup' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Inf' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Ger' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Prc' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Foc/gi' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Emph' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Pref' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Dim/ke' -a 10 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+ABBR' -a 5 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+ACR' -a 5 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/Rare' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/Hyp' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/NotNorm' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Usage/CommonNotNorm' -a 30 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Use/Circ' -a 0 -A \ + | $(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) -S '+Guess' -a 200 -A \ + > $@ + +else !WITH_OFST_TROPICAL + +guesser-raw.weighted.hfst: guesser-raw.simple.hfst + cp $< $@ +endif !WITH_OFST_TROPICAL + +# make the raw ones +# HFST: +# map the name to GT/Divvun conventions + +generator-raw-gt-desc.hfst: generator-raw-gt-desc.weighted.hfst + cp $< $@ + +# Tokens ending with a dot (e.g. abbreviations) need special treatment by a tokeniser +# They cannot be a part of the analyser that the tokeniser uses +# Therefore, dot-ending stuff must be added separately to the default descriptive analyser + +# .dot transducer is the base for: +# 1. -desc, -norm etc transducers +# 2.
guesser + +# This is the default, descriptive analyser: +# Visible tags (ie do NOT remove): +# - variant tags +# - the Err/Orth tag +# Invisible tags (ie to be removed): +# - semantic tags +# - homonymy tags + +analyser-gt-desc.dot.tmp.%: analyser-raw-gt-desc.% \ + fst/abbrevdot.tmp.% \ + filters/remove-area-tags.% \ + filters/remove-dialect-tags.% \ + filters/remove-number-string-tags.% \ + filters/remove-usage-tags.% \ + filters/remove-semantic-tags.% \ + filters/remove-hyphenation-marks.% \ + filters/remove-infl_deriv-borders.% \ + filters/remove-word-boundary.% \ + filters/remove-orthography-tags.% \ + filters/remove-Orth_IPA-strings.% \ + filters/remove-orig_lang-tags.% \ + filters/remove-Use_GC-strings.% \ + filters/remove-Use_minusGC-tags.% \ + filters/remove-Use_minus_PMatch-tags.% \ + filters/remove-Use_PMatch-strings.% \ + filters/remove-mwe-tags.% \ + orthography/inituppercase.compose.% \ + orthography/allcaps.compose.% \ + orthography/spellrelax.compose.% \ + $(GLT_DOWNCASE_FILTER) + $(AM_V_XFST_TOOL)$(PRINTF) "read regex \ + @\"filters/remove-area-tags.$*\" \ + .o. @\"filters/remove-dialect-tags.$*\" \ + .o. @\"filters/remove-number-string-tags.$*\" \ + .o. @\"filters/remove-usage-tags.$*\" \ + .o. @\"filters/remove-semantic-tags.$*\" \ + .o. @\"filters/remove-orig_lang-tags.$*\" \ + .o. @\"filters/remove-orthography-tags.$*\" \ + .o. @\"filters/remove-Orth_IPA-strings.$*\" \ + .o. @\"filters/remove-Use_minus_PMatch-tags.$*\" \ + .o. @\"filters/remove-Use_GC-strings.$*\" \ + .o. @\"filters/remove-Use_minusGC-tags.$*\" \ + .o. @\"filters/remove-Use_PMatch-strings.$*\" \ + .o. @\"filters/remove-mwe-tags.$*\" \ + .o. [@\"$<\" | @\"fst/abbrevdot.tmp.$*\"] \ + $(GLT_DOWNCASE_COMPOSE) \ + .o. @\"filters/remove-hyphenation-marks.$*\" \ + .o. @\"filters/remove-infl_deriv-borders.$*\" \ + .o. @\"filters/remove-word-boundary.$*\" \ + ; \n\ + define fst \n\ + set flag-is-epsilon ON\n\ + set encode-weights ON\n\ + read regex fst \ + .o. 
@\"orthography/inituppercase.compose.$*\" \ + .o. @\"orthography/allcaps.compose.$*\" \ + .o. @\"orthography/spellrelax.compose.$*\" \ + ; \n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# This is the default, descriptive generating transducer. +generator-gt-desc.dot.tmp.%: analyser-raw-gt-desc.% \ + fst/abbrevdot.tmp.% \ + filters/make-optional-transitivity-tags.% \ + filters/make-optional-homonymy-tags.% \ + filters/make-optional-hyph-tags.% \ + filters/make-optional-variant-tags.% \ + filters/make-optional-semantic-tags.% \ + filters/make-optional-error-tags.% \ + filters/make-optional-adv_comp-tags.% \ + filters/make-optional-orig_lang-tags.% \ + filters/remove-area-tags.% \ + filters/remove-dialect-tags.% \ + filters/remove-hyphenation-marks.% \ + filters/remove-infl_deriv-borders.% \ + filters/remove-word-boundary.% \ + filters/remove-number-string-tags.% \ + filters/remove-orthography-tags.% \ + filters/remove-Orth_IPA-strings.% \ + filters/remove-usage-tags.% \ + filters/remove-Use_GC-strings.% \ + filters/remove-Use_minusGC-tags.% \ + filters/remove-Use_minus_PMatch-tags.% \ + filters/remove-Use_PMatch-strings.% \ + filters/remove-mwe-tags.% \ + $(GLT_DOWNCASE_FILTER) + $(AM_V_XFST_TOOL)$(PRINTF) "read regex \ + @\"filters/make-optional-transitivity-tags.$*\" \ + .o. @\"filters/make-optional-homonymy-tags.$*\" \ + .o. @\"filters/make-optional-hyph-tags.$*\" \ + .o. @\"filters/make-optional-variant-tags.$*\" \ + .o. @\"filters/make-optional-semantic-tags.$*\" \ + .o. @\"filters/make-optional-error-tags.$*\" \ + .o. @\"filters/make-optional-adv_comp-tags.$*\" \ + .o. @\"filters/make-optional-orig_lang-tags.$*\" \ + .o. @\"filters/remove-area-tags.$*\" \ + .o. @\"filters/remove-dialect-tags.$*\" \ + .o. @\"filters/remove-number-string-tags.$*\" \ + .o. @\"filters/remove-usage-tags.$*\" \ + .o. @\"filters/remove-orthography-tags.$*\" \ + .o. @\"filters/remove-Orth_IPA-strings.$*\" \ + .o. @\"filters/remove-Use_minus_PMatch-tags.$*\" \ + .o. 
@\"filters/remove-Use_GC-strings.$*\" \ + .o. @\"filters/remove-Use_minusGC-tags.$*\" \ + .o. @\"filters/remove-Use_PMatch-strings.$*\" \ + .o. @\"filters/remove-mwe-tags.$*\" \ + .o. [@\"$<\" | @\"fst/abbrevdot.tmp.$*\"] \ + $(GLT_DOWNCASE_COMPOSE) \ + .o. @\"filters/remove-hyphenation-marks.$*\" \ + .o. @\"filters/remove-infl_deriv-borders.$*\" \ + .o. @\"filters/remove-word-boundary.$*\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# override the default descriptive analyser and generator +# result: lexicon-based analyses + +analyser-gt-desc.tmp.hfst: analyser-gt-desc.dot.tmp.hfst \ + filters/remove-guessed-forms.est.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "read regex \ + @\"filters/remove-guessed-forms.est.hfst\" \ + .o. @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +generator-gt-desc.tmp.hfst: generator-gt-desc.dot.tmp.hfst \ + filters/remove-guessed-forms.est.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "read regex \ + @\"filters/remove-guessed-forms.est.hfst\" \ + .o. @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +#--- begin guesser-related ad hoc + +# substitute placeholders with fsts containing real symbols; +# relax hyphen and apostrophe writing conventions +# result: lexicon-based analyses plus blablawords +# this will be later unioned with simplex word guesser to result in a complete guesser + +analyser-gt-descguess.hfst: analyser-gt-desc.dot.tmp.hfst \ + fst/substitute_blockcap.xfscript \ + fst/substitutions.xfscript \ + orthography/punctrelax.compose.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "set encode-weights ON\n\ + read regex @\"$<\";\n\ + source fst/substitute_blockcap.xfscript\n\ + source fst/substitutions.xfscript\n\ + define fst\n\ + read regex fst \ + .o. @\"orthography/punctrelax.compose.hfst\" \ + ;\n\ + $(INVERT_HFST)\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# ... 
and generators: +# substitute placeholders with fsts containing real symbols -pair_initial.tmp.lexc: pair_initial.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) - $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ - cat $< >> $@ && \ - cat $(EST_AFFIX_FILES:%=%.lexc) | sed 's/@.\.[^@]*@//g' >> $@ - -num_initial.tmp.lexc: num_initial.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) - $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ - cat $< >> $@ && \ - cat $(EST_AFFIX_FILES:%=%.lexc) | sed 's/@.\.[^@]*@//g' >> $@ - -pair_final.tmp.lexc: pair_final.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) - $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ - cat $< >> $@ && \ - cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ - -num_final.tmp.lexc: num_final.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) - $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ - cat $< >> $@ && \ - cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ - -# abbreviations with a dot ... -# ... should be kept separately because of tokenisation issues -# copy the abbrevs that can have a final dot and format them appropriately: -# attach a final dot and continuation lexicon ; -# and add flag diacritics to prevent these abbreviations to be part of some compound word - -abbrevdot.tmp.lexc: stems/abbreviations.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) - $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ - echo 'LEXICON Root' >> $@ && \ - echo ' @D.Part@@P.Part.Bad@ DABBR ;' >> $@ && \ - echo 'LEXICON DABBR' >> $@ && \ - $(AM_V_GEN)cat $< | grep '^ *!.*DOTABBR' | sed 's/^ *!//' >> $@ && \ - $(AM_V_GEN)cat $< | grep 'may also end with a dot' | \ - sed 's/:\([^ ]*\) *[^ ]* *; *! 
*may also end with a dot/:\1 DOTABBR ;/' >> $@ && \ - cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ - - -# guesser: -# re-using root.lexc and affixes -guesser-simplex-nouns.tmp.lexc: guesser-simplex-nouns.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) - $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ - cat $< >> $@ && \ - cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ - -guesser-names.tmp0.lexc: guesser-simplex-nouns.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) - $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ - cat $< >> $@ && \ - cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ - -### END: Local processing: ### - -####### Other targets: ########### -# Clean: add local clean targets on separate lines, so that the first line can -# easily get updates from the template dir through svn merge. -clean-local: - -rm -f *.all.* *fst *.foma *.script generated_files/*.lexc lexicon.* - -rm -f url.lexc *.tmp* - -rm -f *.relabel lexicon-tags.* lexicon-sigma.* - -include $(srcdir)/Makefile.modifications-phon.am -include $(top_srcdir)/../giella-core/am-shared/src-morphology-dir-include.am - -# vim: set ft=automake: +generator-gt-descguess.hfst: generator-gt-desc.dot.tmp.hfst \ + fst/substitute_blockcap.xfscript \ + fst/substitutions.xfscript + $(AM_V_XFST_TOOL)$(PRINTF) "set encode-weights ON\n\ + read regex @\"$<\";\n\ + source fst/substitute_blockcap.xfscript\n\ + source fst/substitutions.xfscript\n\ + $(INVERT_XFST)$(INVERT_FOMA)\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# --- end guesser-related ad hoc + +# override the default normative analyser and generator +# (is it necessary actually? where are they used ?) +analyser-gt-norm.tmp.hfst: analyser-gt-desc.tmp.hfst \ + filters/remove-nospell-words.est.hfst \ + filters/remove-NotNorm-wordforms.est.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/remove-nospell-words.est.hfst\" \ + .o. @\"filters/remove-NotNorm-wordforms.est.hfst\" \ + .o. 
@\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +generator-gt-norm.tmp.hfst: generator-gt-desc.tmp.hfst \ + filters/remove-nospell-words.est.hfst \ + filters/remove-NotNorm-wordforms.est.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"filters/remove-nospell-words.est.hfst\" \ + .o. @\"filters/remove-NotNorm-wordforms.est.hfst\" \ + .o. @\"$<\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +# HFST: +# simplex word guesser; +# result: guessed simplex words and derivations with weights +# (surface side without phonotactics symbols) +generator-raw-gt-guess.hfst: guesser-raw.weighted.hfst \ + filters/remove-hyphenation-marks.hfst \ + filters/remove-infl_deriv-borders.hfst \ + filters/remove-word-boundary.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "read regex \ + @\"$<\" \ + .o. @\"filters/remove-hyphenation-marks.hfst\" \ + .o. @\"filters/remove-infl_deriv-borders.hfst\" \ + .o. @\"filters/remove-word-boundary.hfst\" \ + ;\n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +# HFST: +# simplex word guesser, enlarged to cover standard orthography +# upcasing added (like for analyser-gt-desc) +# perhaps a spellrelax filter should be also added ? +analyser-raw-gt-guess.hfst: generator-raw-gt-guess.hfst \ + orthography/inituppercase.compose.hfst \ + orthography/allcaps.compose.hfst \ + orthography/spellrelax.compose.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "set flag-is-epsilon ON\n\ + read regex \ + @\"$<\" \ + .o. [@\"orthography/inituppercase.compose.hfst\" \ + | @\"orthography/allcaps.compose.hfst\" ] \ + .o. 
@\"orthography/spellrelax.compose.hfst\" \ + ;\n\ + invert net \n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +# XEROX: +# probably garbage anyway +analyser-raw-gt-desc.xfst: analyser-raw-gt-desc.vocabulary.xfst + cp $< $@ + +# complete guesser +# includes +# 1) phonological patterns for simplex words (and names) and their derivations +# 2) all the lexicon-based words (simplex, compound, derived; paired) +# 3) blablawords, i.e. words where the last component gets an analysis, and the rest is +Guess + +# katsetus +generator-gt-guess.hfst: generator-gt-descguess.hfst \ + generator-raw-gt-guess.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "set encode-weights ON\n\ + read regex \ + [[ @\"$<\"] | @\"generator-raw-gt-guess.hfst\"] \ + ; \n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + +analyser-gt-guess.hfst: analyser-gt-descguess.hfst \ + analyser-raw-gt-guess.hfst + $(AM_V_XFST_TOOL)$(PRINTF) "set encode-weights ON\n\ + read regex \ + [[ @\"$<\" ] | @\"analyser-raw-gt-guess.hfst\"] \ + ; \n\ + save stack $@\n\ + quit\n" | $(XFST_TOOL) + + +# XEROX: +# not implemented... 
+# + + +################################################################## +#### END: Add local processing instructions ABOVE this line ###### +################################################################## +include $(top_srcdir)/../giella-core/am-shared/src-fst-dir-include.am diff --git a/src/filters/.gitignore b/src/fst/filters/.gitignore similarity index 100% rename from src/filters/.gitignore rename to src/fst/filters/.gitignore diff --git a/src/filters/Makefile.am b/src/fst/filters/Makefile.am similarity index 64% rename from src/filters/Makefile.am rename to src/fst/filters/Makefile.am index 643fbf9e..4724b67d 100644 --- a/src/filters/Makefile.am +++ b/src/fst/filters/Makefile.am @@ -27,23 +27,7 @@ GIELLA_FILTER_LOCAL_REGEX_SRCS=\ rename-POS_before_Der-tags.regex # List any local filter xfscript files here: -GIELLA_FILTER_LOCAL_XFSCRIPT_SRCS=reorder-tags.est.xfscript \ - remove-sg-forms.est.xfscript \ - remove-pl-forms.est.xfscript \ - remove-sg-nom-forms.est.xfscript \ - remove-non-gi-forms.est.xfscript \ - remove-usage-tags.est.xfscript \ - remove-nospell-words.est.xfscript \ - remove-NotNorm-wordforms.est.xfscript \ - modify-derivations.est.xfscript \ - block-derivations.est.xfscript \ - block-compounds.est.xfscript \ - wordpair-filter.est.xfscript \ - numeral-filter.est.xfscript \ - evaluate-flags.est.xfscript \ - downcase-derived_proper-strings.est.xfscript \ - upcase-guessed-names.est.xfscript \ - remove-guessed-forms.est.xfscript +GIELLA_FILTER_LOCAL_XFSCRIPT_SRCS= # List any local filter lexc files here: GIELLA_FILTER_LOCAL_LEXC_SRCS= diff --git a/src/filters/block-compounds.est.xfscript b/src/fst/filters/block-compounds.est.xfscript similarity index 100% rename from src/filters/block-compounds.est.xfscript rename to src/fst/filters/block-compounds.est.xfscript diff --git a/src/filters/block-derivations.est.xfscript b/src/fst/filters/block-derivations.est.xfscript similarity index 100% rename from src/filters/block-derivations.est.xfscript rename 
to src/fst/filters/block-derivations.est.xfscript diff --git a/src/filters/downcase-derived_proper-strings.est.xfscript b/src/fst/filters/downcase-derived_proper-strings.est.xfscript similarity index 100% rename from src/filters/downcase-derived_proper-strings.est.xfscript rename to src/fst/filters/downcase-derived_proper-strings.est.xfscript diff --git a/src/filters/downcase_UCletters.regex b/src/fst/filters/downcase_UCletters.regex similarity index 100% rename from src/filters/downcase_UCletters.regex rename to src/fst/filters/downcase_UCletters.regex diff --git a/src/filters/evaluate-flags.est.xfscript b/src/fst/filters/evaluate-flags.est.xfscript similarity index 100% rename from src/filters/evaluate-flags.est.xfscript rename to src/fst/filters/evaluate-flags.est.xfscript diff --git a/src/filters/modify-derivations.est.xfscript b/src/fst/filters/modify-derivations.est.xfscript similarity index 100% rename from src/filters/modify-derivations.est.xfscript rename to src/fst/filters/modify-derivations.est.xfscript diff --git a/src/filters/numeral-filter.est.xfscript b/src/fst/filters/numeral-filter.est.xfscript similarity index 100% rename from src/filters/numeral-filter.est.xfscript rename to src/fst/filters/numeral-filter.est.xfscript diff --git a/src/filters/remove-DNorm-tags.regex b/src/fst/filters/remove-DNorm-tags.regex similarity index 100% rename from src/filters/remove-DNorm-tags.regex rename to src/fst/filters/remove-DNorm-tags.regex diff --git a/src/filters/remove-NotNorm-wordforms.est.xfscript b/src/fst/filters/remove-NotNorm-wordforms.est.xfscript similarity index 100% rename from src/filters/remove-NotNorm-wordforms.est.xfscript rename to src/fst/filters/remove-NotNorm-wordforms.est.xfscript diff --git a/src/filters/remove-derivation-position-tags.regex b/src/fst/filters/remove-derivation-position-tags.regex similarity index 100% rename from src/filters/remove-derivation-position-tags.regex rename to 
src/fst/filters/remove-derivation-position-tags.regex diff --git a/src/filters/remove-guessed-forms.est.xfscript b/src/fst/filters/remove-guessed-forms.est.xfscript similarity index 100% rename from src/filters/remove-guessed-forms.est.xfscript rename to src/fst/filters/remove-guessed-forms.est.xfscript diff --git a/src/filters/remove-non-gi-forms.est.xfscript b/src/fst/filters/remove-non-gi-forms.est.xfscript similarity index 100% rename from src/filters/remove-non-gi-forms.est.xfscript rename to src/fst/filters/remove-non-gi-forms.est.xfscript diff --git a/src/filters/remove-norm-comp-tags.regex b/src/fst/filters/remove-norm-comp-tags.regex similarity index 100% rename from src/filters/remove-norm-comp-tags.regex rename to src/fst/filters/remove-norm-comp-tags.regex diff --git a/src/filters/remove-nospell-words.est.xfscript b/src/fst/filters/remove-nospell-words.est.xfscript similarity index 100% rename from src/filters/remove-nospell-words.est.xfscript rename to src/fst/filters/remove-nospell-words.est.xfscript diff --git a/src/filters/remove-pl-forms.est.xfscript b/src/fst/filters/remove-pl-forms.est.xfscript similarity index 100% rename from src/filters/remove-pl-forms.est.xfscript rename to src/fst/filters/remove-pl-forms.est.xfscript diff --git a/src/filters/remove-sg-forms.est.xfscript b/src/fst/filters/remove-sg-forms.est.xfscript similarity index 100% rename from src/filters/remove-sg-forms.est.xfscript rename to src/fst/filters/remove-sg-forms.est.xfscript diff --git a/src/filters/remove-sg-nom-forms.est.xfscript b/src/fst/filters/remove-sg-nom-forms.est.xfscript similarity index 100% rename from src/filters/remove-sg-nom-forms.est.xfscript rename to src/fst/filters/remove-sg-nom-forms.est.xfscript diff --git a/src/filters/remove-usage-tags.est.xfscript b/src/fst/filters/remove-usage-tags.est.xfscript similarity index 100% rename from src/filters/remove-usage-tags.est.xfscript rename to src/fst/filters/remove-usage-tags.est.xfscript diff --git 
a/src/filters/rename-POS_before_Der-tags.regex b/src/fst/filters/rename-POS_before_Der-tags.regex similarity index 100% rename from src/filters/rename-POS_before_Der-tags.regex rename to src/fst/filters/rename-POS_before_Der-tags.regex diff --git a/src/filters/reorder-tags.est.xfscript b/src/fst/filters/reorder-tags.est.xfscript similarity index 100% rename from src/filters/reorder-tags.est.xfscript rename to src/fst/filters/reorder-tags.est.xfscript diff --git a/src/filters/upcase-guessed-names.est.xfscript b/src/fst/filters/upcase-guessed-names.est.xfscript similarity index 100% rename from src/filters/upcase-guessed-names.est.xfscript rename to src/fst/filters/upcase-guessed-names.est.xfscript diff --git a/src/filters/upcase-guessed_proper-strings.est.xfscript b/src/fst/filters/upcase-guessed_proper-strings.est.xfscript similarity index 100% rename from src/filters/upcase-guessed_proper-strings.est.xfscript rename to src/fst/filters/upcase-guessed_proper-strings.est.xfscript diff --git a/src/filters/wordpair-filter.est.xfscript b/src/fst/filters/wordpair-filter.est.xfscript similarity index 100% rename from src/filters/wordpair-filter.est.xfscript rename to src/fst/filters/wordpair-filter.est.xfscript diff --git a/src/fst/morphology/Makefile.am b/src/fst/morphology/Makefile.am new file mode 100644 index 00000000..fdd0c51d --- /dev/null +++ b/src/fst/morphology/Makefile.am @@ -0,0 +1,202 @@ +## Process this file with automake to produce Makefile.in + +## Copyright (C) 2011 Samediggi + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see <http://www.gnu.org/licenses/>. + +# Add language-specific flags for hfst-lexc compilation here: +if HAVE_SHARED_COMMON +HFST_LEXC_LOCAL_FLAGS= # --Werror # uncomment if lexc is good enough +else +HFST_LEXC_LOCAL_FLAGS= # No --Werror if deps are missing ! +endif + + +####### Morphology source file defs: ######## + +# Set this to name of lexc file containing Multichar_Symbols and LEXICON Root +GT_LEXC_ROOT=$(srcdir)/root.lexc + +# Set this to the names of all regular lexc source files: +GT_LEXC_SRCS_L1_L2=\ + stems/abbreviations.lexc \ + stems/adjectives.lexc \ + stems/noninflecting_adjectives.lexc \ + stems/comparative_adjectives.lexc \ + stems/superlative_adjectives.lexc \ + stems/adpositions.lexc \ + stems/adverbs.lexc \ + stems/conjunctions.lexc \ + stems/genitive_attributes.lexc \ + stems/interjections.lexc \ + stems/nouns.lexc \ + stems/cardinalnumerals.lexc \ + stems/ordinalnumerals.lexc \ + stems/pronouns.lexc \ + stems/propernouns.lexc \ + stems/verbs.lexc \ + stems/noninflecting_verbs.lexc \ + stems/prefixes.lexc \ + stems/final_components.lexc \ + stems/numbers.lexc \ + stems/acronyms.lexc \ + stems/symbol_strings.lexc \ + affixes/regular_declinations.lexc \ + affixes/exceptional_declinations.lexc \ + affixes/verbs.lexc \ + affixes/gi.lexc + +# If you are building an error-detecting L2 analyser, specify the lexc files +# that differ between the regular L1 and the L2 analysers below, in L1 and +# L2 respectively. L2 files must end in "*-L2.lexc". See SME for an example. 
+L1= + +L2= + +GT_LEXC_SRCS=\ + $(GT_LEXC_SRCS_L1_L2) \ + $(L1) + +GT_LEXC_L2_SRCS=\ + $(GT_LEXC_SRCS_L1_L2) \ + $(L2) + +# Set this to the names of all generated lexc files, if any +GENERATED_LEXC_SRCS=generated_files/mul-$(GLANG)-punctuation.lexc \ + generated_files/mul-$(GLANG)-symbols.lexc + +# change handling of shared lexical data here: +if HAVE_SHARED_COMMON +.generated/url.tmp.lexc: $(gt_SHARED_common)/src/fst/url.lexc + $(MAKE) $(GENDIR) + $(AM_V_CP)cp -f $< $@ + +generated_files/mul-$(GLANG)-%.lexc: $(gt_SHARED_common)/src/fst/stems/%.lexc + $(AM_V_at)$(MKDIR_P) generated_files + $(AM_V_CP)cp -f $< $@ +else +# this is "safe" fallback (compiles but you miss everything) +.generated/url.tmp.lexc: + echo "LEXICON Root" > $@ + echo "< h t t p (s) %: %/ %/ ?*> # ;" >> $@ + +.generated/mul-$(GLANG)-%.lexc: + $(MAKE) $(GENDIR) + echo "! Missing shared common data" > $@ +endif +# add other lexical shared data handling here + +# Set this to the names of all source xml files, if any +GT_XML_SRCS= + +# Define any additional lexc sources here (compiled on their own): +GT_LOCAL_SRCS=\ + pair_initial.tmp.lexc \ + pair_final.tmp.lexc \ + num_initial.tmp.lexc \ + num_final.tmp.lexc \ + abbrevdot.tmp.lexc \ + guesser-simplex-nouns.tmp.lexc \ + guesser-names.tmp0.lexc + +# guesser-derivations.tmp.lexc +# punctuation.tmp.lexc +# acronyms.tmp.lexc + +# Define local xfscripts here: +GT_LOCAL_XFSCRIPT_SRCS= + +# Define here any additional sources just included in the distro: +GT_DISTRO_SRCS= + +### BEGIN: Local processing: ### +EST_AFFIX_FILES=$(srcdir)/affixes/regular_declinations \ + $(srcdir)/affixes/exceptional_declinations \ + $(srcdir)/affixes/verbs \ + $(srcdir)/affixes/gi + +# make the parts inflect +# by re-using root.lexc and affixes, and omitting all the stem lexicons +# ... and remove the flag diacritics from initial parts (why? because otherwise the filter that +# puts the initial and final part together doesn't work ?) 
+ +pair_initial.tmp.lexc: pair_initial.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) + $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ + cat $< >> $@ && \ + cat $(EST_AFFIX_FILES:%=%.lexc) | sed 's/@.\.[^@]*@//g' >> $@ + +num_initial.tmp.lexc: num_initial.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) + $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ + cat $< >> $@ && \ + cat $(EST_AFFIX_FILES:%=%.lexc) | sed 's/@.\.[^@]*@//g' >> $@ + +pair_final.tmp.lexc: pair_final.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) + $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ + cat $< >> $@ && \ + cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ + +num_final.tmp.lexc: num_final.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) + $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ + cat $< >> $@ && \ + cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ + +# abbreviations with a dot ... +# ... should be kept separately because of tokenisation issues +# copy the abbrevs that can have a final dot and format them appropriately: +# attach a final dot and continuation lexicon ; +# and add flag diacritics to prevent these abbreviations to be part of some compound word + +abbrevdot.tmp.lexc: stems/abbreviations.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) + $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ + echo 'LEXICON Root' >> $@ && \ + echo ' @D.Part@@P.Part.Bad@ DABBR ;' >> $@ && \ + echo 'LEXICON DABBR' >> $@ && \ + $(AM_V_GEN)cat $< | grep '^ *!.*DOTABBR' | sed 's/^ *!//' >> $@ && \ + $(AM_V_GEN)cat $< | grep 'may also end with a dot' | \ + sed 's/:\([^ ]*\) *[^ ]* *; *! 
*may also end with a dot/:\1 DOTABBR ;/' >> $@ && \ + cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ + + + +# guesser: +# re-using root.lexc and affixes +guesser-simplex-nouns.tmp.lexc: guesser-simplex-nouns.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) + $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ + cat $< >> $@ && \ + cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ + +guesser-names.tmp0.lexc: guesser-simplex-nouns.lexc $(GT_LEXC_ROOT) $(EST_AFFIX_FILES:%=%.lexc) + $(AM_V_GEN)cat $(GT_LEXC_ROOT) | sed '/^ *LEXICON *Root/,$$d' > $@ && \ + cat $< >> $@ && \ + cat $(EST_AFFIX_FILES:%=%.lexc) >> $@ + +### List additional targets in the following variable, for build targets not +### covered by other means. This comes ***in addition to*** what you can do by +### just targeting lexicon.*: lexicon.tmp.*, and is useful if you want to build +### separate fst's that need further treatment in the src/ dir. See the language +### gle/ for an example of how this is used. + +GIELLA_LOCAL_TARGETS= + +### END: Local processing: ### + +####### Other targets: ########### +# Clean: add local clean targets on separate lines, so that the first line can +# easily get updates from the template dir through svn merge. 
+ +include $(srcdir)/Makefile.modifications-local.am +include $(srcdir)/Makefile.modifications-phon.am +include $(top_srcdir)/../giella-core/am-shared/src-morphology-dir-include.am + +# vim: set ft=automake: diff --git a/src/fst/morphology/Makefile.modifications-local.am b/src/fst/morphology/Makefile.modifications-local.am new file mode 100644 index 00000000..f60bc693 --- /dev/null +++ b/src/fst/morphology/Makefile.modifications-local.am @@ -0,0 +1,20 @@ +## Process this file with automake to produce Makefile.in + +## Copyright (C) 2011 Samediggi + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see <http://www.gnu.org/licenses/>. + +# Add local build rules and shared here... + + diff --git a/src/fst/morphology/Makefile.modifications-phon.am b/src/fst/morphology/Makefile.modifications-phon.am new file mode 100644 index 00000000..8c872aee --- /dev/null +++ b/src/fst/morphology/Makefile.modifications-phon.am @@ -0,0 +1,26 @@ +## Process this file with automake to produce Makefile.in + +## Copyright (C) 2011 Samediggi + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. 
+ +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see <http://www.gnu.org/licenses/>. + +####### Source file defs: ######## + +#! @param GT_PHONOLOGY_MAIN required, the source of phonology +#! @param GT_PHONOLOGY_SUPPLEMENTS optional, other sources to distribute and +#! compile for other things +GT_PHONOLOGY_MAIN=phonology.twolc +GT_PHONOLOGY_SUPPLEMENTS= + +# vim: set ft=automake: diff --git a/src/fst/affixes/exceptional_declinations.lexc b/src/fst/morphology/affixes/exceptional_declinations.lexc similarity index 100% rename from src/fst/affixes/exceptional_declinations.lexc rename to src/fst/morphology/affixes/exceptional_declinations.lexc diff --git a/src/fst/affixes/gi.lexc b/src/fst/morphology/affixes/gi.lexc similarity index 100% rename from src/fst/affixes/gi.lexc rename to src/fst/morphology/affixes/gi.lexc diff --git a/src/fst/affixes/regular_declinations.lexc b/src/fst/morphology/affixes/regular_declinations.lexc similarity index 100% rename from src/fst/affixes/regular_declinations.lexc rename to src/fst/morphology/affixes/regular_declinations.lexc diff --git a/src/fst/affixes/verbs.lexc b/src/fst/morphology/affixes/verbs.lexc similarity index 100% rename from src/fst/affixes/verbs.lexc rename to src/fst/morphology/affixes/verbs.lexc diff --git a/src/fst/generated_files/00README.txt b/src/fst/morphology/generated_files/00README.txt similarity index 100% rename from src/fst/generated_files/00README.txt rename to src/fst/morphology/generated_files/00README.txt diff --git a/src/fst/incoming/00README.txt b/src/fst/morphology/incoming/00README.txt similarity index 100% rename from src/fst/incoming/00README.txt rename to src/fst/morphology/incoming/00README.txt diff --git 
a/src/fst/phonology.twolc b/src/fst/morphology/phonology.twolc similarity index 100% rename from src/fst/phonology.twolc rename to src/fst/morphology/phonology.twolc diff --git a/src/fst/root.lexc b/src/fst/morphology/root.lexc similarity index 100% rename from src/fst/root.lexc rename to src/fst/morphology/root.lexc diff --git a/src/fst/stems/abbreviations.lexc b/src/fst/morphology/stems/abbreviations.lexc similarity index 100% rename from src/fst/stems/abbreviations.lexc rename to src/fst/morphology/stems/abbreviations.lexc diff --git a/src/fst/stems/acronyms.lexc b/src/fst/morphology/stems/acronyms.lexc similarity index 100% rename from src/fst/stems/acronyms.lexc rename to src/fst/morphology/stems/acronyms.lexc diff --git a/src/fst/stems/adjectives.lexc b/src/fst/morphology/stems/adjectives.lexc similarity index 100% rename from src/fst/stems/adjectives.lexc rename to src/fst/morphology/stems/adjectives.lexc diff --git a/src/fst/stems/adpositions.lexc b/src/fst/morphology/stems/adpositions.lexc similarity index 100% rename from src/fst/stems/adpositions.lexc rename to src/fst/morphology/stems/adpositions.lexc diff --git a/src/fst/stems/adverbs.lexc b/src/fst/morphology/stems/adverbs.lexc similarity index 100% rename from src/fst/stems/adverbs.lexc rename to src/fst/morphology/stems/adverbs.lexc diff --git a/src/fst/stems/cardinalnumerals.lexc b/src/fst/morphology/stems/cardinalnumerals.lexc similarity index 100% rename from src/fst/stems/cardinalnumerals.lexc rename to src/fst/morphology/stems/cardinalnumerals.lexc diff --git a/src/fst/stems/comparative_adjectives.lexc b/src/fst/morphology/stems/comparative_adjectives.lexc similarity index 100% rename from src/fst/stems/comparative_adjectives.lexc rename to src/fst/morphology/stems/comparative_adjectives.lexc diff --git a/src/fst/stems/conjunctions.lexc b/src/fst/morphology/stems/conjunctions.lexc similarity index 100% rename from src/fst/stems/conjunctions.lexc rename to 
src/fst/morphology/stems/conjunctions.lexc diff --git a/src/fst/stems/final_components.lexc b/src/fst/morphology/stems/final_components.lexc similarity index 100% rename from src/fst/stems/final_components.lexc rename to src/fst/morphology/stems/final_components.lexc diff --git a/src/fst/stems/genitive_attributes.lexc b/src/fst/morphology/stems/genitive_attributes.lexc similarity index 100% rename from src/fst/stems/genitive_attributes.lexc rename to src/fst/morphology/stems/genitive_attributes.lexc diff --git a/src/fst/stems/interjections.lexc b/src/fst/morphology/stems/interjections.lexc similarity index 100% rename from src/fst/stems/interjections.lexc rename to src/fst/morphology/stems/interjections.lexc diff --git a/src/fst/stems/noninflecting_adjectives.lexc b/src/fst/morphology/stems/noninflecting_adjectives.lexc similarity index 100% rename from src/fst/stems/noninflecting_adjectives.lexc rename to src/fst/morphology/stems/noninflecting_adjectives.lexc diff --git a/src/fst/stems/noninflecting_verbs.lexc b/src/fst/morphology/stems/noninflecting_verbs.lexc similarity index 100% rename from src/fst/stems/noninflecting_verbs.lexc rename to src/fst/morphology/stems/noninflecting_verbs.lexc diff --git a/src/fst/stems/nouns.lexc b/src/fst/morphology/stems/nouns.lexc similarity index 100% rename from src/fst/stems/nouns.lexc rename to src/fst/morphology/stems/nouns.lexc diff --git a/src/fst/stems/numbers.lexc b/src/fst/morphology/stems/numbers.lexc similarity index 100% rename from src/fst/stems/numbers.lexc rename to src/fst/morphology/stems/numbers.lexc diff --git a/src/fst/stems/ordinalnumerals.lexc b/src/fst/morphology/stems/ordinalnumerals.lexc similarity index 100% rename from src/fst/stems/ordinalnumerals.lexc rename to src/fst/morphology/stems/ordinalnumerals.lexc diff --git a/src/fst/stems/prefixes.lexc b/src/fst/morphology/stems/prefixes.lexc similarity index 100% rename from src/fst/stems/prefixes.lexc rename to src/fst/morphology/stems/prefixes.lexc 
diff --git a/src/fst/stems/pronouns.lexc b/src/fst/morphology/stems/pronouns.lexc similarity index 100% rename from src/fst/stems/pronouns.lexc rename to src/fst/morphology/stems/pronouns.lexc diff --git a/src/fst/stems/propernouns.lexc b/src/fst/morphology/stems/propernouns.lexc similarity index 100% rename from src/fst/stems/propernouns.lexc rename to src/fst/morphology/stems/propernouns.lexc diff --git a/src/fst/stems/superlative_adjectives.lexc b/src/fst/morphology/stems/superlative_adjectives.lexc similarity index 100% rename from src/fst/stems/superlative_adjectives.lexc rename to src/fst/morphology/stems/superlative_adjectives.lexc diff --git a/src/fst/stems/symbol_strings.lexc b/src/fst/morphology/stems/symbol_strings.lexc similarity index 100% rename from src/fst/stems/symbol_strings.lexc rename to src/fst/morphology/stems/symbol_strings.lexc diff --git a/src/fst/stems/verbs.lexc b/src/fst/morphology/stems/verbs.lexc similarity index 100% rename from src/fst/stems/verbs.lexc rename to src/fst/morphology/stems/verbs.lexc diff --git a/src/orthography/Makefile.am b/src/fst/orthography/Makefile.am similarity index 93% rename from src/orthography/Makefile.am rename to src/fst/orthography/Makefile.am index d093009f..ff46a78a 100644 --- a/src/orthography/Makefile.am +++ b/src/fst/orthography/Makefile.am @@ -15,8 +15,7 @@ GT_ORTHOGRAPHIC_REGEX_SRCS=\ GT_ORTHOGRAPHIC_XFSCRIPT_SRCS=\ allcaps.xfscript \ downcase-derived_proper-strings.xfscript \ - spellrelax-with-tags.xfscript \ - punctrelax.xfscript + spellrelax-with-tags.xfscript # Add extra targets here: GIELLA_ORTH_EXTRA_TARGETS= diff --git a/src/orthography/allcaps.xfscript b/src/fst/orthography/allcaps.xfscript similarity index 100% rename from src/orthography/allcaps.xfscript rename to src/fst/orthography/allcaps.xfscript diff --git a/src/orthography/downcase-derived_proper-strings.xfscript b/src/fst/orthography/downcase-derived_proper-strings.xfscript similarity index 100% rename from 
src/orthography/downcase-derived_proper-strings.xfscript rename to src/fst/orthography/downcase-derived_proper-strings.xfscript diff --git a/src/orthography/inituppercase.regex b/src/fst/orthography/inituppercase.regex similarity index 100% rename from src/orthography/inituppercase.regex rename to src/fst/orthography/inituppercase.regex diff --git a/src/orthography/punctrelax.xfscript b/src/fst/orthography/punctrelax.xfscript similarity index 100% rename from src/orthography/punctrelax.xfscript rename to src/fst/orthography/punctrelax.xfscript diff --git a/src/orthography/spellrelax-mobile-keyboard.regex b/src/fst/orthography/spellrelax-mobile-keyboard.regex similarity index 100% rename from src/orthography/spellrelax-mobile-keyboard.regex rename to src/fst/orthography/spellrelax-mobile-keyboard.regex diff --git a/src/orthography/spellrelax-tags.regex b/src/fst/orthography/spellrelax-tags.regex similarity index 100% rename from src/orthography/spellrelax-tags.regex rename to src/fst/orthography/spellrelax-tags.regex diff --git a/src/orthography/spellrelax-with-tags.xfscript b/src/fst/orthography/spellrelax-with-tags.xfscript similarity index 100% rename from src/orthography/spellrelax-with-tags.xfscript rename to src/fst/orthography/spellrelax-with-tags.xfscript diff --git a/src/orthography/spellrelax.regex b/src/fst/orthography/spellrelax.regex similarity index 100% rename from src/orthography/spellrelax.regex rename to src/fst/orthography/spellrelax.regex diff --git a/src/phonetics/Makefile.am b/src/fst/phonetics/Makefile.am similarity index 100% rename from src/phonetics/Makefile.am rename to src/fst/phonetics/Makefile.am diff --git a/src/phonetics/tests/Makefile.am b/src/fst/phonetics/tests/Makefile.am similarity index 100% rename from src/phonetics/tests/Makefile.am rename to src/fst/phonetics/tests/Makefile.am diff --git a/src/phonetics/tests/run_tests.sh.in b/src/fst/phonetics/tests/run_tests.sh.in similarity index 100% rename from 
src/phonetics/tests/run_tests.sh.in rename to src/fst/phonetics/tests/run_tests.sh.in diff --git a/src/fst/phonetics/tests/tests/Makefile.am b/src/fst/phonetics/tests/tests/Makefile.am new file mode 100644 index 00000000..dc17381e --- /dev/null +++ b/src/fst/phonetics/tests/tests/Makefile.am @@ -0,0 +1,19 @@ +## Process this file with automake to produce Makefile.in +## Copyright: Sámediggi/Divvun/UiT +## Licence: GPL v3+ + +######## Test targets: ########### + +if WANT_PHONETIC + +# List here (space separated) all yaml files to be run as part of make check: +TESTS= + +# List tests that are presently (expected) failures here, ie things that should +# be fixed *later*, but is not critical at the moment: +XFAIL_TESTS= + +endif # WANT_PHONETIC + +#### Do NOT edit below here: #### +include $(top_srcdir)/../giella-core/am-shared/src-phonetics-tests-dir-include.am diff --git a/src/fst/phonetics/tests/tests/run_tests.sh.in b/src/fst/phonetics/tests/tests/run_tests.sh.in new file mode 100644 index 00000000..baaa6f84 --- /dev/null +++ b/src/fst/phonetics/tests/tests/run_tests.sh.in @@ -0,0 +1,89 @@ +#!/bin/bash +## Process this file with configure to produce the actual shell script +## Copyright: Sámediggi/Divvun/UiT +## Licence: GPL v3+ + +# Test runner to test conversion to IPA. $1 is a tab-separated test file: the line containing 'fst' names the transducer in column 2; every data line after the first gives input (column 1) and expected output (column 2). + +# Use autotools mechanisms to only run the configured fst types in the tests: +fsttype= +@CAN_HFST_TRUE@fsttype="$fsttype hfst" +@CAN_XFST_TRUE@fsttype="$fsttype xfst" +@CAN_FOMA_TRUE@fsttype="$fsttype foma" + +# Exit if all fst types have been shut off (exit status 77 reports the test as skipped): +if [[ "x$fsttype" == "x" ]]; then + echo "All transducer types have been shut off at configure time." + echo "Nothing to test. SKIPPED." 
+ exit 77 +fi + +fst_num=$(echo "$fsttype" | wc -w) +# Debug: echo Number of fst´s: $fst_num + +fst=$(grep -v '^#' $1 | grep -v '^\s*$' | grep 'fst' | cut -f2) + +# Debug: +# echo FST: $fst + +grep -v '^#' $1 | grep -v '^\s*$' | tail -n +2 | cut -f1 > innput.txt +grep -v '^#' $1 | grep -v '^\s*$' | tail -n +2 | cut -f2 > expect.txt + +###### Start testing: ####### +transducer_found=0 +fails=0 + +# .---------- constant part! +# vvvv vvvv-- colour code +RED='\033[0;31m' +GREEN='\033[0;32m' +BOLD='\033[1m' +NC='\033[0m' # No Color + + +# Loop over the transducer types first - we test both hfst, xfst and foma +# according to the configuration: +for f in $fsttype; do + # DEBUG: echo "Fst loop 35: $f" + if test $f == "xfst"; then + lookuptool="@LOOKUP@ -flags mbTT" + elif test $f == "foma"; then + lookuptool="@FLOOKUP@" + elif test $f == "hfst"; then + lookuptool="@HFST_LOOKUP@ -q" + else + fails=$((fails+1)) + echo "FAIL: Unknown fst type! FST=$f" + continue + fi + # Run lookup, then clean the output as follows before saving: + # 1. remove extra empty lines + # 2. convert two or more newlines to XXYYZZ (two newlines are cohort separators) + # 3. convert single newlines to ' XXXX ' - marks multiple outputs from single input + # 4. convert XXYYZZ back to a single newline + # 5. cleanup on the last line + $lookuptool ../$fst.$f < innput.txt | cut -f1-2 \ + | sed '${/^[[:space:]]*$/d;}' \ + | perl -0pe 's/\n\n+/XXYYZZ/g;' \ + | perl -0pe 's/\n/ XXXX /g;' \ + | perl -pe 's/XXYYZZ/\n/g' \ + | sed '$ s/ XXXX $/\n/' > output.${f}.txt + + # The actual test: compare the expected output with the output of the fst type just run: + diff expect.txt <(cut -f2- output.${f}.txt) + + # Just to be sure, capture the output value - it might give different + # results for hfst, foma and xfst (that would be a bug in the offending + # one, but one never knows). This way the test will fail even if the last + # fst type being tested was successful if one of the earlier types failed. 
+ # + # And if more than one fst type is tested, print output status for each: + if (($? > 0)) ; then + fails=$((fails+1)) + (($fst_num > 1)) && echo -e " ${BOLD}FAILED: ${RED}$f${NC}" + else + (($fst_num > 1)) && echo -e " ${BOLD}PASSED: ${GREEN}$f${NC}" + fi +done + +exit $fails diff --git a/src/phonetics/txt2ipa.xfscript b/src/fst/phonetics/txt2ipa.xfscript similarity index 100% rename from src/phonetics/txt2ipa.xfscript rename to src/fst/phonetics/txt2ipa.xfscript diff --git a/src/hyphenation/Makefile.am b/src/fst/syllabification/Makefile.am similarity index 100% rename from src/hyphenation/Makefile.am rename to src/fst/syllabification/Makefile.am diff --git a/src/hyphenation/hyphenation.xfscript b/src/fst/syllabification/hyphenation.xfscript similarity index 100% rename from src/hyphenation/hyphenation.xfscript rename to src/fst/syllabification/hyphenation.xfscript diff --git a/src/tagsets/Makefile.am b/src/fst/tagsets/Makefile.am similarity index 100% rename from src/tagsets/Makefile.am rename to src/fst/tagsets/Makefile.am diff --git a/src/transcriptions/Makefile.am b/src/fst/transcriptions/Makefile.am similarity index 100% rename from src/transcriptions/Makefile.am rename to src/fst/transcriptions/Makefile.am diff --git a/src/transcriptions/transcriptor-abbrevs2text.lexc b/src/fst/transcriptions/transcriptor-abbrevs2text.lexc similarity index 100% rename from src/transcriptions/transcriptor-abbrevs2text.lexc rename to src/fst/transcriptions/transcriptor-abbrevs2text.lexc diff --git a/src/transcriptions/transcriptor-clock-digit2text.lexc b/src/fst/transcriptions/transcriptor-clock-digit2text.lexc similarity index 100% rename from src/transcriptions/transcriptor-clock-digit2text.lexc rename to src/fst/transcriptions/transcriptor-clock-digit2text.lexc diff --git a/src/transcriptions/transcriptor-date-digit2text.lexc b/src/fst/transcriptions/transcriptor-date-digit2text.lexc similarity index 100% rename from src/transcriptions/transcriptor-date-digit2text.lexc 
rename to src/fst/transcriptions/transcriptor-date-digit2text.lexc diff --git a/src/transcriptions/transcriptor-numbers-digit2text.lexc b/src/fst/transcriptions/transcriptor-numbers-digit2text.lexc similarity index 100% rename from src/transcriptions/transcriptor-numbers-digit2text.lexc rename to src/fst/transcriptions/transcriptor-numbers-digit2text.lexc diff --git a/test/src/morphology/generate-propernoun-lemmas.sh.in b/test/src/morphology/generate-propernoun-lemmas.sh.in index b0b9541c..4efd8d4d 100644 --- a/test/src/morphology/generate-propernoun-lemmas.sh.in +++ b/test/src/morphology/generate-propernoun-lemmas.sh.in @@ -22,7 +22,7 @@ ###### Variables: ####### POS=propernouns ### in ### -source_file=${srcdir}/../../../src/fst/stems/${POS}.lexc +source_file=${srcdir}/../../../src/fst/morphology/stems/${POS}.lexc generator_file=./../../../src/generator-gt-norm analyser_file=./../../../src/analyser-gt-norm diff --git a/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in b/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in index fc4ef778..0d465350 100755 --- a/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in +++ b/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in @@ -17,7 +17,7 @@ GIELLA_LANG=@GTLANG2@ ospell=@HFST_OSPELL@ ### in ### -source_files=${srcdir}/../../../../../../src/fst/stems/*.lexc +source_files=${srcdir}/../../../../../../src/fst/morphology/stems/*.lexc speller_dir=./../../../../../../tools/spellcheckers ### out ### diff --git a/tools/spellcheckers/soovita.cpp.utf8.1 b/tools/spellcheckers/soovita.cpp.utf8.1 index 2240331c..dc969bc0 100644 --- a/tools/spellcheckers/soovita.cpp.utf8.1 +++ b/tools/spellcheckers/soovita.cpp.utf8.1 @@ -1,651 +1,651 @@ -/* -* soovitaja -* konstrueerib sona alusel stringe, mille korrektsust siis sonastikust -* chkmin() abil kontrollitakse; kui mo~ni on korrektne so~na, siis -* pannakse ta SugAhel -sse -* kogu protsess ka"ib ja"rgmiselt: -* 1. 
korda soovitaja poole po"o"rdudes po"o"rdutakse SugFirst() poole; -* ja"rgmistel kordadel SugNext() poole -* kui SugAhel == NULL, siis s.t. sobivaid soovitusi pole -*/ -/* -#include -#include -#include -#include - -#include "ini_mrf.h" -#include "soovita.h" -#include "morf.h" -#include "chup.h" -#include "mrflags.h" - -extern MRF_FLAGS mrfFlags; - -char InsertCharsBeg[]="kpstvlmrahnejioü""dfubõg"; // <=0.37: äšöžz // -char InsertChars[]="aeistlunkmodrvghjpäõ""büö"; // <= 0.12: fšžcwy // -char EstAlphas[]="abcdefghijklmnopqrsšzžtuvwõäöüxy"; - -struct _ChangeArray{ - char from; - char *to; -}CA[]={ // kujult, ling, klaviatuurilt // - {'a', "ä" "e" "s"}, - {'b', "" "p" "nvgh"}, - {'c', "" "" "dv"}, - {'d', "" "t" "esr"}, - {'e', "" "a" "sdr"}, - {'f', "" "" "tdrvg"}, - {'g', "" "kjž" "tvhb"}, - {'h', "" "" "ungjb"}, - {'i', "" "j" "uko"}, - {'j', "" "igž" "unkmh"}, - {'k', "" "g" "ilmo"}, - {'l', "" "" "kopö"}, - {'m', "" "" "nkj"}, - {'n', "" "" "mhjb"}, - {'o', "öõ" "u" "ilkp"}, - {'p', "" "b" "loüö"}, - {'q', "" "" "a"}, - {'r', "" "" "etd"}, - {'s', "š" "z" "aed"}, - {'š', "s" "ž" ""}, - {'z', "ž" "s" "a"}, - {'ž', "z" "š" ""}, - {'t', "" "d" "rg"}, - {'u', "ü" "o" "ihj"}, - {'v', "" "" "gb"}, - {'w', "" "" "aes"}, - {'õ', "oö" "" "äü"}, - {'ä', "a" "" "õüö"}, - {'ö', "oõ" "" "lpäü"}, - {'ü', "u" "" "päõö"}, - {'x', "" "" "sd"}, - {'y', "" "ü" "tugh"}, - {'\xa2', "õöo" "" ""}, // o`, o', o^ // - {'\x95', "õöo" "" ""}, - {'\xe4', "õöo" "" ""}, - {'\xa0', "ä""a" "" ""}, // a`, a', a^ // - {'\x85', "ä""a" "" ""}, - {'\x83', "ä""a" "" ""}, - {'\xa3', "üu" "" ""}, // u`, u', u^ // - {'\x97', "üu" "" ""}, - {'\x96', "üu" "" ""}, - {'\0', EstAlphas}, -}; - -struct _ChangeArrayStr{ - char *from; - char *to; -}CAStr[]={ - {"sh", "š"}, - {"zh", "ž"}, - {"x", "ks"}, - {"f", "hv"}, - {"hv", "f"}, - {"ff", "hv"}, - {"mb", "mm"}, - {"ää", "ea"}, - {"g", "dž"}, - {"dž", "g"}, - {"j", "dž"}, - {"dž", "j"}, - {"data", "tada"}, - {"tada", "data"}, - {"o~", "õ"}, - {"a\"", "ä"}, - 
{"o\"", "ö"}, - {"u\"", "ü"}, - {"s^", "š"}, - {"z^", "ž"}, - {0, 0} -}; - -struct _ChangeArrayStr CAEStr[]={ - {"si", "seid"}, - {"si", "sid"}, - {"seid", "si"}, - {"seid", "sid"}, - {"sid", "si"}, - {"sid", "seid"}, - {0, 0} -}; - -char //roomanr,// tasand, tagasitasand; - -SUG *SugAhel=0; // soovituste ahel // - -// --------------------------------------------------------- // -//#include // - -//#if defined (SPELLER) -char IsInUDR(char *sisse); -char IsInExcUDR(char *sisse); -char *IsInChangeUDR(char *word); -//#endif - -int SugGroupLevel(int Level){ - switch (Level){ - case 0: - case 1: - case 2: - case 3: - return 0; - case 4: - case 5: - return 1; - case 6: - return 2; - default: - return 3; - } -} - -int SugUnGroupLevel(int Level){ - switch (Level){ - case 0: - return 3; - case 1: - return 5; - case 2: - return 6; - default: - return 100; - } -} - -int SugSpell(char *word){ // =0 kui word on oige sona // - int nr; - char outstr2[STEMLEN*4]; - int maxtasand, tagasi; - - maxtasand = tasand; - if (strlen(word)<=1){ - outstr2[0]=0; - } - else{ - chkmin(word, 0, outstr2, &nr, maxtasand, &tagasi); - tagasitasand = tagasi; - } - if (outstr2[0]) - { -//#if defined(SPELLER) - if(mrfFlags.Chk(MF_SPELL) && !IsInExcUDR(word)) -//#endif - return 0; - } -//#if defined(SPELLER) - if(mrfFlags.Chk(MF_SPELL)) - { - tagasitasand=0; - if (IsInUDR(word)) return 0; - } -//#endif - return 1; -} - -void SugAdd(char *word){ - SUG *s, *s1; - char tasanduus; - s=(SUG *)malloc(sizeof(SUG)); - strcpy(s->tyvi, word); - s->sug_next=0; - s->tasand=tagasitasand; - if (!SugAhel){ - SugAhel=s; - s->sug_prev=0; - } - else{ - for (s1=SugAhel; s1->sug_next; s1=s1->sug_next); - s1->sug_next=s; - s->sug_prev=s1; - } - tasanduus=SugUnGroupLevel(SugGroupLevel(tagasitasand)); - if (tasanduustasand1) tasand1=tagasitasand; -// ch[0]=ChToLower(ch[0]); - - if (!SugSpell(word)){ - if (tagasitasand>tasand1) tasand1=tagasitasand; - } - else{ - word[0]=ChToUpper(word[0]); - } - -// if (!SugSpell(ch)){ - if 
(tagasitasand>tasand1) tasand1=tagasitasand; - } - else{ - ch[0]=ChToUpper(ch[0]); - }// - ch--; - (*ch)=' '; - tagasitasand=tasand1; - SugAdd(word); - return 1; -} - -void SugFree(void){ - SUG *s; - for (s=SugAhel; s; ){ - SugAhel=SugAhel->sug_next; - free(s); - s=SugAhel; - } -} - -int SugCapitalize(char *word){ - char *ch; - int i; - SUG *SA; - if (!ChIsUpper(*word)){ - return 0; - } - for (SA=SugAhel; SA; SA=SA->sug_next){ - SA->tyvi[0]=ChToUpper(SA->tyvi[0]); - } - - i=1; - for (ch=word; *ch; ch++){ - i&=ChIsUpperUP(*ch); - } - if (!i){ - return 0; - } - for (SA=SugAhel; SA; SA=SA->sug_next){ - for (ch=SA->tyvi; *ch; ch++){ - (*ch)=ChToUpper(*ch); - } - } - return 0; -} - -int SugSimilar(char *word1, char *word2){ - int pos; - pos=strlen(word1); - if (abs(pos-strlen(word2))>1) return 0; - if (memcmp(word1, word2, pos)){ - return 0; - } - if (word2[pos]){ - if (strchr(EstAlphas, word2[pos])){ - return 0; - } - } - return 1; -} - -int SugRemoveCopy(){ - SUG *SA, *SA1, *SA2; - for (SA=SugAhel; SA; SA=SA->sug_next){ - for (SA2=SugAhel; SA2; ){ - if (SA==SA2){ - SA2=SA2->sug_next; - continue; - } - if (SugSimilar(SA->tyvi, SA2->tyvi)){ - if (SA2->sug_prev) SA2->sug_prev->sug_next=SA2->sug_next; - if (SA2->sug_next) SA2->sug_next->sug_prev=SA2->sug_prev; - SA1=SA2; - SA2=SA2->sug_next; - if (SA1==SugAhel){ - SugAhel=SA2; - } - free(SA1); - } - else{ - SA2=SA2->sug_next; - } - } - } - return 0; -} - -int SugRemoveImmoderate(){ - SUG *SA, *SB; - int Useful; - if (!SugAhel) return 0; - Useful=SugGroupLevel(SugAhel->tasand); - for (SA=SugAhel; SA; SA=SA->sug_next){ - if (SugGroupLevel(SA->tasand)!=Useful){ - SA->sug_prev->sug_next=0; - SB=SugAhel; - SugAhel=SA; - SugFree(); - SugAhel=SB; - return 0; - } - } - return 0; -} - -int SugOrder(){ - SUG* SA, *SA1, *SA2; - for (SA=SugAhel; SA; SA=SA->sug_next){ - SA1=SA; - for (SA2=SA->sug_next; SA2; SA2=SA2->sug_next){ - if (SA2->tasandtasand){ - SA1=SA2; - } - } - if (SA1!=SA){ - if (SA1->sug_prev) 
SA1->sug_prev->sug_next=SA1->sug_next; - if (SA1->sug_next) SA1->sug_next->sug_prev=SA1->sug_prev; - SA1->sug_prev=SA->sug_prev; - if (SA->sug_prev) SA->sug_prev->sug_next=SA1; - SA1->sug_next=SA; - SA->sug_prev=SA1; - SA=SA1; - if (!SA->sug_prev) SugAhel=SA; - } - } - return 0; -} - -int SugNext(const char *word, int len){ - SugFree(); - return 0; -} - -int SugFirst(const char *word, int len, int sygavus, int *reserveeritud_tulelvuku_tarbeks){ - int i, j, len2; - char *ch; - char *wordlow, *testword; - char c, tasand1; - struct _ChangeArray *CA1; - struct _ChangeArrayStr *CAStr1; -// char roomanr1=roomanr; -// roomanr=0; - - SugFree(); - - wordlow=(char *)malloc(len+1); - testword=(char *)malloc(len+20); - for (i=0; i<=len; i++){ - wordlow[i]=ChToLower(word[i]); - } - -//#if defined(SPELLER) - if(mrfFlags.Chk(MF_SPELL)) - { - tagasitasand=0; - ch=IsInChangeUDR((char *)word); - if (ch){ - SugAdd(ch); - } - } -//#endif -// SUG_CASE: // - tasand1=sygavus; - tasand=100; - SugCheck(wordlow); - tasand=tasand1; - -// SUG_ABBR: // - strcpy(testword, wordlow); - strcat(testword, "-"); // sona- // - SugCheck(testword); - - ch=strchr(word, '-'); - if ((ch) && (ch!=word)){ - if ((ChIsUpper(*(ch-1))) && (!ChIsUpperUP(*(ch+1)))){ - strcpy(testword, word); // BNS- // - testword[ch-word+1] = 'i'; // BNS-i // - strcpy(testword+(ch-word)+2, ch+1); // BNS-ile // - tasand1=tasand; - tasand=100; // Natuke sygavamalt // - SugCheck(testword); - tasand=tasand1; - } - } - else if (ChIsUpper(*word)){ - ch=(char *)word; - for (i=1; i; ch++, i=ChIsUpper(*ch)); - if (!ChIsUpperUP(*ch)){ - strcpy(testword, word); // BNS // - testword[ch-word] = 'i'; // BNSi // - strcpy(testword+(ch-word)+1, ch); // BNSile // - tasand1=tasand; - tasand=100; // Natuke sygavamalt // - SugCheck(testword); - tasand=tasand1; - } - } - -// SUG_ADDSPACE: Renel oli wordlow word asemel HJK 12.05.98 // - for (i=1; ifrom; CAStr1++){ - if (memcmp(wordlow+i, CAStr1->from, strlen(CAStr1->from))==0){ - memcpy(testword, 
wordlow, i); - strcpy(testword+i, CAStr1->to); - strcat(testword, wordlow+i+strlen(CAStr1->from)); - SugCheck(testword); - } - } - } - -// SUG_CHANGEENDBLOCKS: // - for (CAStr1=CAEStr; CAStr1->from; CAStr1++){ - len2=strlen(CAStr1->from); - if (len2<=len){ - if (strcmp(wordlow+len-len2, CAStr1->from)==0){ - memcpy(testword, wordlow, len-len2); - strcpy(testword+len-len2, CAStr1->to); - SugCheck(testword); - } - } - } - -// gi* -> *gi // - for (i=3; i<=5; i++){ - if (i>len) break; - if (memcmp(wordlow+len-i, "gi", 2)==0){ - memcpy(testword, wordlow, len-i); - strcpy(testword+len-i, wordlow+len-i+2); - strcpy(testword+len-2, "gi"); - SugCheck(testword); - strcpy(testword+len-2, "ki"); - SugCheck(testword); - } - } - -// SUG_DELLETTERS: // - if (len>1){ - for (i=0; ifrom)||(!CA1->from)){ - for (ch=CA1->to; *ch; ch++){ - testword[i]=*ch; - SugCheck(testword); - } - break; - } - } - } - -// SUG_INSERTLETTERS: // - for (ch=InsertChars; *ch; ch++){ - for (i=1; i +#include +#include +#include + +#include "ini_mrf.h" +#include "soovita.h" +#include "morf.h" +#include "chup.h" +#include "mrflags.h" + +extern MRF_FLAGS mrfFlags; + +char InsertCharsBeg[]="kpstvlmrahnejioü""dfubõg"; // <=0.37: äšöžz // +char InsertChars[]="aeistlunkmodrvghjpäõ""büö"; // <= 0.12: fšžcwy // +char EstAlphas[]="abcdefghijklmnopqrsšzžtuvwõäöüxy"; + +struct _ChangeArray{ + char from; + char *to; +}CA[]={ // kujult, ling, klaviatuurilt // + {'a', "ä" "e" "s"}, + {'b', "" "p" "nvgh"}, + {'c', "" "" "dv"}, + {'d', "" "t" "esr"}, + {'e', "" "a" "sdr"}, + {'f', "" "" "tdrvg"}, + {'g', "" "kjž" "tvhb"}, + {'h', "" "" "ungjb"}, + {'i', "" "j" "uko"}, + {'j', "" "igž" "unkmh"}, + {'k', "" "g" "ilmo"}, + {'l', "" "" "kopö"}, + {'m', "" "" "nkj"}, + {'n', "" "" "mhjb"}, + {'o', "öõ" "u" "ilkp"}, + {'p', "" "b" "loüö"}, + {'q', "" "" "a"}, + {'r', "" "" "etd"}, + {'s', "š" "z" "aed"}, + {'š', "s" "ž" ""}, + {'z', "ž" "s" "a"}, + {'ž', "z" "š" ""}, + {'t', "" "d" "rg"}, + {'u', "ü" "o" "ihj"}, + {'v', "" "" 
"gb"}, + {'w', "" "" "aes"}, + {'õ', "oö" "" "äü"}, + {'ä', "a" "" "õüö"}, + {'ö', "oõ" "" "lpäü"}, + {'ü', "u" "" "päõö"}, + {'x', "" "" "sd"}, + {'y', "" "ü" "tugh"}, + {'\xa2', "õöo" "" ""}, // o`, o', o^ // + {'\x95', "õöo" "" ""}, + {'\xe4', "õöo" "" ""}, + {'\xa0', "ä""a" "" ""}, // a`, a', a^ // + {'\x85', "ä""a" "" ""}, + {'\x83', "ä""a" "" ""}, + {'\xa3', "üu" "" ""}, // u`, u', u^ // + {'\x97', "üu" "" ""}, + {'\x96', "üu" "" ""}, + {'\0', EstAlphas}, +}; + +struct _ChangeArrayStr{ + char *from; + char *to; +}CAStr[]={ + {"sh", "š"}, + {"zh", "ž"}, + {"x", "ks"}, + {"f", "hv"}, + {"hv", "f"}, + {"ff", "hv"}, + {"mb", "mm"}, + {"ää", "ea"}, + {"g", "dž"}, + {"dž", "g"}, + {"j", "dž"}, + {"dž", "j"}, + {"data", "tada"}, + {"tada", "data"}, + {"o~", "õ"}, + {"a\"", "ä"}, + {"o\"", "ö"}, + {"u\"", "ü"}, + {"s^", "š"}, + {"z^", "ž"}, + {0, 0} +}; + +struct _ChangeArrayStr CAEStr[]={ + {"si", "seid"}, + {"si", "sid"}, + {"seid", "si"}, + {"seid", "sid"}, + {"sid", "si"}, + {"sid", "seid"}, + {0, 0} +}; + +char //roomanr,// tasand, tagasitasand; + +SUG *SugAhel=0; // soovituste ahel // + +// --------------------------------------------------------- // +//#include // + +//#if defined (SPELLER) +char IsInUDR(char *sisse); +char IsInExcUDR(char *sisse); +char *IsInChangeUDR(char *word); +//#endif + +int SugGroupLevel(int Level){ + switch (Level){ + case 0: + case 1: + case 2: + case 3: + return 0; + case 4: + case 5: + return 1; + case 6: + return 2; + default: + return 3; + } +} + +int SugUnGroupLevel(int Level){ + switch (Level){ + case 0: + return 3; + case 1: + return 5; + case 2: + return 6; + default: + return 100; + } +} + +int SugSpell(char *word){ // =0 kui word on oige sona // + int nr; + char outstr2[STEMLEN*4]; + int maxtasand, tagasi; + + maxtasand = tasand; + if (strlen(word)<=1){ + outstr2[0]=0; + } + else{ + chkmin(word, 0, outstr2, &nr, maxtasand, &tagasi); + tagasitasand = tagasi; + } + if (outstr2[0]) + { +//#if defined(SPELLER) + 
if(mrfFlags.Chk(MF_SPELL) && !IsInExcUDR(word)) +//#endif + return 0; + } +//#if defined(SPELLER) + if(mrfFlags.Chk(MF_SPELL)) + { + tagasitasand=0; + if (IsInUDR(word)) return 0; + } +//#endif + return 1; +} + +void SugAdd(char *word){ + SUG *s, *s1; + char tasanduus; + s=(SUG *)malloc(sizeof(SUG)); + strcpy(s->tyvi, word); + s->sug_next=0; + s->tasand=tagasitasand; + if (!SugAhel){ + SugAhel=s; + s->sug_prev=0; + } + else{ + for (s1=SugAhel; s1->sug_next; s1=s1->sug_next); + s1->sug_next=s; + s->sug_prev=s1; + } + tasanduus=SugUnGroupLevel(SugGroupLevel(tagasitasand)); + if (tasanduustasand1) tasand1=tagasitasand; +// ch[0]=ChToLower(ch[0]); + + if (!SugSpell(word)){ + if (tagasitasand>tasand1) tasand1=tagasitasand; + } + else{ + word[0]=ChToUpper(word[0]); + } + +// if (!SugSpell(ch)){ + if (tagasitasand>tasand1) tasand1=tagasitasand; + } + else{ + ch[0]=ChToUpper(ch[0]); + }// + ch--; + (*ch)=' '; + tagasitasand=tasand1; + SugAdd(word); + return 1; +} + +void SugFree(void){ + SUG *s; + for (s=SugAhel; s; ){ + SugAhel=SugAhel->sug_next; + free(s); + s=SugAhel; + } +} + +int SugCapitalize(char *word){ + char *ch; + int i; + SUG *SA; + if (!ChIsUpper(*word)){ + return 0; + } + for (SA=SugAhel; SA; SA=SA->sug_next){ + SA->tyvi[0]=ChToUpper(SA->tyvi[0]); + } + + i=1; + for (ch=word; *ch; ch++){ + i&=ChIsUpperUP(*ch); + } + if (!i){ + return 0; + } + for (SA=SugAhel; SA; SA=SA->sug_next){ + for (ch=SA->tyvi; *ch; ch++){ + (*ch)=ChToUpper(*ch); + } + } + return 0; +} + +int SugSimilar(char *word1, char *word2){ + int pos; + pos=strlen(word1); + if (abs(pos-strlen(word2))>1) return 0; + if (memcmp(word1, word2, pos)){ + return 0; + } + if (word2[pos]){ + if (strchr(EstAlphas, word2[pos])){ + return 0; + } + } + return 1; +} + +int SugRemoveCopy(){ + SUG *SA, *SA1, *SA2; + for (SA=SugAhel; SA; SA=SA->sug_next){ + for (SA2=SugAhel; SA2; ){ + if (SA==SA2){ + SA2=SA2->sug_next; + continue; + } + if (SugSimilar(SA->tyvi, SA2->tyvi)){ + if (SA2->sug_prev) 
SA2->sug_prev->sug_next=SA2->sug_next; + if (SA2->sug_next) SA2->sug_next->sug_prev=SA2->sug_prev; + SA1=SA2; + SA2=SA2->sug_next; + if (SA1==SugAhel){ + SugAhel=SA2; + } + free(SA1); + } + else{ + SA2=SA2->sug_next; + } + } + } + return 0; +} + +int SugRemoveImmoderate(){ + SUG *SA, *SB; + int Useful; + if (!SugAhel) return 0; + Useful=SugGroupLevel(SugAhel->tasand); + for (SA=SugAhel; SA; SA=SA->sug_next){ + if (SugGroupLevel(SA->tasand)!=Useful){ + SA->sug_prev->sug_next=0; + SB=SugAhel; + SugAhel=SA; + SugFree(); + SugAhel=SB; + return 0; + } + } + return 0; +} + +int SugOrder(){ + SUG* SA, *SA1, *SA2; + for (SA=SugAhel; SA; SA=SA->sug_next){ + SA1=SA; + for (SA2=SA->sug_next; SA2; SA2=SA2->sug_next){ + if (SA2->tasandtasand){ + SA1=SA2; + } + } + if (SA1!=SA){ + if (SA1->sug_prev) SA1->sug_prev->sug_next=SA1->sug_next; + if (SA1->sug_next) SA1->sug_next->sug_prev=SA1->sug_prev; + SA1->sug_prev=SA->sug_prev; + if (SA->sug_prev) SA->sug_prev->sug_next=SA1; + SA1->sug_next=SA; + SA->sug_prev=SA1; + SA=SA1; + if (!SA->sug_prev) SugAhel=SA; + } + } + return 0; +} + +int SugNext(const char *word, int len){ + SugFree(); + return 0; +} + +int SugFirst(const char *word, int len, int sygavus, int *reserveeritud_tulelvuku_tarbeks){ + int i, j, len2; + char *ch; + char *wordlow, *testword; + char c, tasand1; + struct _ChangeArray *CA1; + struct _ChangeArrayStr *CAStr1; +// char roomanr1=roomanr; +// roomanr=0; + + SugFree(); + + wordlow=(char *)malloc(len+1); + testword=(char *)malloc(len+20); + for (i=0; i<=len; i++){ + wordlow[i]=ChToLower(word[i]); + } + +//#if defined(SPELLER) + if(mrfFlags.Chk(MF_SPELL)) + { + tagasitasand=0; + ch=IsInChangeUDR((char *)word); + if (ch){ + SugAdd(ch); + } + } +//#endif +// SUG_CASE: // + tasand1=sygavus; + tasand=100; + SugCheck(wordlow); + tasand=tasand1; + +// SUG_ABBR: // + strcpy(testword, wordlow); + strcat(testword, "-"); // sona- // + SugCheck(testword); + + ch=strchr(word, '-'); + if ((ch) && (ch!=word)){ + if 
((ChIsUpper(*(ch-1))) && (!ChIsUpperUP(*(ch+1)))){ + strcpy(testword, word); // BNS- // + testword[ch-word+1] = 'i'; // BNS-i // + strcpy(testword+(ch-word)+2, ch+1); // BNS-ile // + tasand1=tasand; + tasand=100; // Natuke sygavamalt // + SugCheck(testword); + tasand=tasand1; + } + } + else if (ChIsUpper(*word)){ + ch=(char *)word; + for (i=1; i; ch++, i=ChIsUpper(*ch)); + if (!ChIsUpperUP(*ch)){ + strcpy(testword, word); // BNS // + testword[ch-word] = 'i'; // BNSi // + strcpy(testword+(ch-word)+1, ch); // BNSile // + tasand1=tasand; + tasand=100; // Natuke sygavamalt // + SugCheck(testword); + tasand=tasand1; + } + } + +// SUG_ADDSPACE: Renel oli wordlow word asemel HJK 12.05.98 // + for (i=1; ifrom; CAStr1++){ + if (memcmp(wordlow+i, CAStr1->from, strlen(CAStr1->from))==0){ + memcpy(testword, wordlow, i); + strcpy(testword+i, CAStr1->to); + strcat(testword, wordlow+i+strlen(CAStr1->from)); + SugCheck(testword); + } + } + } + +// SUG_CHANGEENDBLOCKS: // + for (CAStr1=CAEStr; CAStr1->from; CAStr1++){ + len2=strlen(CAStr1->from); + if (len2<=len){ + if (strcmp(wordlow+len-len2, CAStr1->from)==0){ + memcpy(testword, wordlow, len-len2); + strcpy(testword+len-len2, CAStr1->to); + SugCheck(testword); + } + } + } + +// gi* -> *gi // + for (i=3; i<=5; i++){ + if (i>len) break; + if (memcmp(wordlow+len-i, "gi", 2)==0){ + memcpy(testword, wordlow, len-i); + strcpy(testword+len-i, wordlow+len-i+2); + strcpy(testword+len-2, "gi"); + SugCheck(testword); + strcpy(testword+len-2, "ki"); + SugCheck(testword); + } + } + +// SUG_DELLETTERS: // + if (len>1){ + for (i=0; ifrom)||(!CA1->from)){ + for (ch=CA1->to; *ch; ch++){ + testword[i]=*ch; + SugCheck(testword); + } + break; + } + } + } + +// SUG_INSERTLETTERS: // + for (ch=InsertChars; *ch; ch++){ + for (i=1; i