From 0c449a8b6deba7fa1d2ccbeff6ab92c4652291e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?=
Date: Wed, 31 May 2023 09:56:23 +0200
Subject: [PATCH 001/111] fix getting component files when bit xi prefix is
used (related to #564)
---
Scripts/parlamint-tei2vert.pl | 20 +++++++-------------
1 file changed, 7 insertions(+), 13 deletions(-)
diff --git a/Scripts/parlamint-tei2vert.pl b/Scripts/parlamint-tei2vert.pl
index e891e8d3a..a7baeaff4 100755
--- a/Scripts/parlamint-tei2vert.pl
+++ b/Scripts/parlamint-tei2vert.pl
@@ -16,23 +16,17 @@
$Saxon = 'java -jar /usr/share/java/saxon.jar';
$TEI2VERT = "$Bin/parlamint2xmlvert.xsl";
$POLISH = "$Bin/parlamint-xml2vert.pl";
+$Includes = "$Bin/get-includes.xsl";
die "Can't find root TEI file with teiHeader: $rootFile\n"
unless -e $rootFile;
-open(IN, '<:utf8', $rootFile);
-$/ = ">";
-$skip = 1; # We skip over XIncludes in the
-while () {
- if (m||) {$skip = 0}
- elsif ($skip) {}
- elsif (m|
Date: Thu, 1 Jun 2023 16:41:25 +0200
Subject: [PATCH 002/111] add target text.ana-XX for generating
ParlaMint-*.ana.txt files (which should be identical with ParlaMint-*.txt
files generated with text-XX target)
---
Makefile | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
index 0c840e617..06300e46e 100644
--- a/Makefile
+++ b/Makefile
@@ -253,14 +253,23 @@ $(chars-XX): chars-%: %
text-XX = $(addprefix text-, $(PARLIAMENTS))
-## text ## create text version from tei files
+## text ## create text version from TEI files
text: $(text-XX)
-## text-XX ## convert tei files to text
+## text-XX ## convert TEI files to text
$(text-XX): text-%: %
rm -f ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/ParlaMint-$<_*.txt
find ${DATADIR} -type f -path "${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/*" -name "ParlaMint-$<_*.xml" | grep -v '.ana.' | $P --jobs 10 \
'$s -xsl:Scripts/parlamint-tei2text.xsl {} > ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/{/.}.txt'
+text.ana-XX = $(addprefix text.ana-, $(PARLIAMENTS))
+## text.ana ## create text version from TEI.ana files
+text.ana: $(text.ana-XX)
+## text-XX ## convert TEI.ana files to text
+$(text.ana-XX): text.ana-%: %
+ rm -f ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/ParlaMint-$<_*.txt
+ find ${DATADIR} -type f -path "${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/*" -name "ParlaMint-$<_*.xml" | grep '.ana.' | $P --jobs 10 \
+ '$s -xsl:Scripts/parlamint-tei2text.xsl {} > ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/{/.}.txt'
+
meta-XX = $(addprefix meta-, $(PARLIAMENTS))
From d9447f42cfe7bd0ff6e8d33017ca6987ec1030e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?=
Date: Thu, 1 Jun 2023 17:09:21 +0200
Subject: [PATCH 003/111] [devel e40dbff4] add target text.ana-XX for
generating ParlaMint-*.ana.txt files (which should be identical with
ParlaMint-*.txt files generated with text-XX target)
---
Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
index 06300e46e..4e9c61355 100644
--- a/Makefile
+++ b/Makefile
@@ -257,7 +257,7 @@ text-XX = $(addprefix text-, $(PARLIAMENTS))
text: $(text-XX)
## text-XX ## convert TEI files to text
$(text-XX): text-%: %
- rm -f ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/ParlaMint-$<_*.txt
+ rm -f `ls ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/ParlaMint-$<_*.txt | grep -v '.ana.'`
find ${DATADIR} -type f -path "${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/*" -name "ParlaMint-$<_*.xml" | grep -v '.ana.' | $P --jobs 10 \
'$s -xsl:Scripts/parlamint-tei2text.xsl {} > ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/{/.}.txt'
@@ -266,7 +266,7 @@ text.ana-XX = $(addprefix text.ana-, $(PARLIAMENTS))
text.ana: $(text.ana-XX)
## text-XX ## convert TEI.ana files to text
$(text.ana-XX): text.ana-%: %
- rm -f ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/ParlaMint-$<_*.txt
+ rm -f ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/ParlaMint-$<_*.ana.txt
find ${DATADIR} -type f -path "${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/*" -name "ParlaMint-$<_*.xml" | grep '.ana.' | $P --jobs 10 \
'$s -xsl:Scripts/parlamint-tei2text.xsl {} > ${DATADIR}/ParlaMint-$<${CORPUSDIR_SUFFIX}/{/.}.txt'
From 8fe48c6590a2c3e7d4337383e4a9ebd180b25b13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toma=C5=BE=20Erjavec?=
Date: Thu, 1 Jun 2023 17:44:01 +0200
Subject: [PATCH 004/111] Add scripts for processing of MTed corpora.
---
Scripts/conllu2tei.pl | 215 +++++++++++++++++
Scripts/mt-conllu2tei.pl | 60 +++++
Scripts/mt-csv2tsv.pl | 49 ++++
Scripts/mt-insert-notes.xsl | 131 ++++++++++
Scripts/mt-insert-s.pl | 47 ++++
Scripts/mt-prepare4mt.xsl | 465 ++++++++++++++++++++++++++++++++++++
6 files changed, 967 insertions(+)
create mode 100755 Scripts/conllu2tei.pl
create mode 100755 Scripts/mt-conllu2tei.pl
create mode 100755 Scripts/mt-csv2tsv.pl
create mode 100644 Scripts/mt-insert-notes.xsl
create mode 100755 Scripts/mt-insert-s.pl
create mode 100644 Scripts/mt-prepare4mt.xsl
diff --git a/Scripts/conllu2tei.pl b/Scripts/conllu2tei.pl
new file mode 100755
index 000000000..4d08c7b89
--- /dev/null
+++ b/Scripts/conllu2tei.pl
@@ -0,0 +1,215 @@
+#!/usr/bin/perl
+# Convert CoNLL-U file to TEI
+# This is a script slightly modified for ParlaMint, taken from
+# https://github.com/clarinsi/TEI-conversions/blob/645dfbece8f52b45a51f159f5874e1038f9f1c12/Scripts/conllu2tei.pl
+
+use warnings;
+use utf8;
+binmode STDERR, 'utf8';
+binmode STDIN, 'utf8';
+binmode STDOUT, 'utf8';
+
+# Extended TEI prefixes to use on annotation
+$ud_prefix = 'ud-syn'; # Prefix for syntactic roles
+$ud_type = 'UD-SYN'; # Type of syntactic dependencies
+
+# ID prefixes
+$doc_prefix = 'doc'; # Prefix for document IDs, if they are numeric in source
+$p_prefix = 'p'; # Prefix for paragraph IDs, if they are numeric in source
+$s_prefix = 's'; # Prefix for sentence IDs, if they are numeric or do not exist in source
+
+print "\n";
+$has_div = 0;
+$has_p = 0;
+$has_s = -1; #Means this is the first sentence
+$doc_n = 0;
+$p_n = 0;
+$s_n = 0;
+
+$/ = "\n\n";
+while (<>) {
+ if (m|# newdoc id = (.+)|) {
+ if (m|# newpar id|) {$has_p = 1}
+ $doc_id = $1;
+ $has_div = 1;
+ $s_n = 0;
+ if ($has_div) {
+ if ($has_p) {print "
\n"}
+ else {print "\n"}
+ print "\n";
+ }
+ if ($doc_id =~ /^\d/) {
+ $doc_n = $doc_id;
+ $doc_id = $doc_prefix . $doc_n
+ }
+ else {$doc_n++}
+ print "\n";
+ unless ($has_p) {print "
\n"}
+ $has_p = 0;
+ }
+ if (m|# newpar id = (.+)|) {
+ $p_id = $1;
+ if ($has_p) {print "\n"}
+ $has_p = 1;
+ $p_n++;
+ $s_n = 0;
+ if ($p_id =~ /^\d/) {
+ $p_id = $p_prefix . $p_n
+ }
+ print "\n";
+ }
+ if (m|# sent_id = (.+)|) {
+ $has_s = 1;
+ $s_id = $1;
+ $s_n++;
+ if ($s_id =~ /^\d/) {
+ $s_id = "$p_id.$s_prefix$s_n";
+ }
+ }
+ else {
+ print "\n" if $has_s == -1;
+ $has_s = 0;
+ $s_n++;
+ $s_id = "$s_prefix$s_n";
+ }
+ print conllu2tei($s_id, $s_n, $_);
+}
+if ($has_p) {print "
\n"}
+if ($has_div) {print " \n"}
+print "\n";
+
+#Convert one sentence into TEI
+sub conllu2tei {
+ my $id = shift;
+ my $n = shift;
+ my $conllu = shift;
+ my $tei;
+ my $tag;
+ my $element;
+ my $space;
+ my $ner_prev;
+ my $ner;
+ my @ids = ();
+ my @toks = ();
+ my @deps = ();
+ $tei = "";
+ foreach my $line (split(/\n/, $conllu)) {
+ next unless $line =~ /^\d+\t/;
+ chomp;
+ my ($n, $token, $lemma, $upos, $xpos, $ufeats, $link, $role, $extra, $local)
+ = split /\t/, $line;
+ # Don't know how to do syntactic words yet
+ # if ($n =~ m|(\d+)-(\d+)|) {
+ # $from = $1;
+ # $to = $2
+ # }
+ $xpos =~ s/-+$//; # Get rid of trailing dashes sometimes introduced by Stanford NLP
+
+ if ($token =~ /^[[:punct:]]+$/) {
+ $tag = 'pc';
+ if ($upos ne '_') {
+ # print STDERR "WARN: changing PoS to punctuation for\n$line\n"
+ # unless ($xpos eq '_' or $xpos eq 'Z')
+ # and ($upos eq 'PUNCT' or $upos eq 'SYM');
+ if ($token =~ /[$%§©+−×÷=<>]/) {$upos = 'SYM'}
+ else {$upos = 'PUNCT'}
+ $ufeats = '_';
+ }
+ $xpos = 'Z' unless $xpos eq '_';
+ }
+ else {$tag = 'w'}
+
+ if ($upos !~ /_/) {
+ $feats = "UPosTag=$upos";
+ $feats .= "|$ufeats" if $ufeats ne '_';
+ }
+
+ #Bug in STANZA:
+ if ($role eq '') {$role = 'dep'}
+
+ if (($ner) = $local =~ /NER=([A-Z-]+)/) {
+ if (my ($type) = $ner =~ /^B-(.+)/) {
+ if ($ner_prev and $ner_prev ne 'O') {
+ push(@toks, "")
+ }
+ push(@toks, "");
+ }
+ #Sometimes NER begins with I! (bug in CLASSLA)
+ elsif (my ($type) = $ner =~ /^I-(.+)/) {
+ if (not($ner_prev) or $ner_prev eq 'O') {
+ push(@toks, "");
+ }
+ }
+ elsif ($ner eq 'O' and $ner_prev and $ner_prev ne 'O') {
+ push(@toks, "")
+ }
+ $ner_prev = $ner
+ }
+
+ $space = $local !~ s/SpaceAfter=No//;
+ $token = &xml_encode($token);
+ $xpos = &xml_encode($xpos);
+ $xpos =~ s/"/"/g;
+ $lemma = &xml_encode($lemma);
+ $lemma =~ s/"/"/g;
+ if ($tag eq 'w') {$element = "<$tag>$token$tag>"}
+ elsif ($tag eq 'pc') {$element = "<$tag>$token$tag>"}
+ if ($xpos ne '_') {$element =~ s|>| pos=\"$xpos\">|}
+ if ($feats and $feats ne '_') {$element =~ s|>| msd=\"$feats\">|}
+ if ($tag eq 'w') {
+ if ($lemma eq '_') {
+ print STDERR "WARN: changing empty lemma to $token for\n$line\n";
+ $lemma = $token
+ }
+ $element =~ s|>| lemma=\"$lemma\">|
+ }
+ $element =~ s|>| join="right">| unless $space;
+ push @ids, $id . '.t' . $n;
+ push @toks, $element;
+ push @deps, "$link\t$n\t$role" #Only if we have a parse
+ if $role ne '_';
+ }
+ #Give IDs to tokens
+ foreach my $id (@ids) {
+ $element = '';
+ #We can have non-token tags here (pushed by the NER handling above), skip them for IDs
+ while ($element !~ m|')
+ }
+ if (@deps) {
+ $tei .= "\n";
+ foreach $dep (@deps) {
+ my ($head, $arg, $role) = split /\t/, $dep;
+ $head_id = $id; #if 0 points to sentence id
+ $head_id .= '.t' . $head if $head;
+ $arg_id = $id . '.t' . $arg;
+ $tei .= " \n";
+ }
+ $tei .= "";
+ }
+ $tei .= "\n\n";
+ return $tei
+}
+
+# Escape the XML markup characters &, < and > in a string.
+# Takes the raw string as its only argument and returns the escaped copy.
+# The ampersand is replaced first, so that the entities inserted for
+# < and > are not escaped a second time.
+# Double quotes are deliberately left alone (see the disabled line below).
+sub xml_encode {
+    my $str = shift;
+    $str =~ s|&|&amp;|g;
+    $str =~ s|<|&lt;|g;
+    $str =~ s|>|&gt;|g;
+    #Don't really want to do it for content
+    #$str =~ s|"|&quot;|g;
+    return $str
+}
diff --git a/Scripts/mt-conllu2tei.pl b/Scripts/mt-conllu2tei.pl
new file mode 100755
index 000000000..cdee80520
--- /dev/null
+++ b/Scripts/mt-conllu2tei.pl
@@ -0,0 +1,96 @@
+#!/usr/bin/perl
+# Convert CoNLL-U file to TEI
+#
+# Usage: mt-conllu2tei.pl <inDir> <notesFile> <conllDir> <outDir>
+#   <inDir>     source TEI, passed as the last argument of the first Saxon
+#               call; its path must contain -XX. or -XX-YY. (upper-case
+#               letters), which is reported as the country
+#   <notesFile> handed to mt-insert-notes.xsl as the notesFile parameter
+#   <conllDir>  holds the <year>/<name>.conllu files
+#   <outDir>    receives the <year>/<name>.ana.xml files
+use strict;
+use warnings;
+use utf8;
+use FindBin qw($Bin);
+use File::Spec;
+use File::Copy;
+use File::Copy::Recursive qw(dircopy);
+use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
+
+binmode STDERR, 'utf8';
+die "Usage: $0 <inDir> <notesFile> <conllDir> <outDir>\n" unless @ARGV >= 4;
+my $inDir = shift;
+my $notesFile = shift;
+my $conllDir = shift;
+my $outDir = shift;
+
+my ($country) = $inDir =~ m|-([A-Z]{2}(-[A-Z]{2})?)\.| or
+    die "Strange input directory $inDir!\n";
+
+# tempdir() fails when its parent directory is missing, so create it first
+my $tempdirroot = "$Bin/tmp";
+make_dir($tempdirroot);
+my $tmpDir = tempdir(DIR => $tempdirroot, CLEANUP => 1);
+
+$notesFile = File::Spec->rel2abs($notesFile);
+my $inTEI = File::Spec->rel2abs($inDir);
+$conllDir = File::Spec->rel2abs($conllDir);
+$outDir = File::Spec->rel2abs($outDir);
+
+my $saxon = "java -jar -Xmx240g /usr/share/java/saxon.jar";
+my $scriptStripSents = "$Bin/mt-prepare4mt.xsl";
+my $scriptConllu2Tei = "$Bin/conllu2tei.pl";
+my $scriptInsertNotes = "$Bin/mt-insert-notes.xsl";
+my $scriptInsertSents = "$Bin/mt-insert-s.pl";
+my $scriptPolish = "$Bin/polish-xml.pl";
+
+print STDERR "INFO: Preparing data for $country\n";
+my $tmpTEI = "$tmpDir/ParlaMint-XX.tmp";
+make_dir($tmpTEI);
+# In $tmpTEI/ make corpus with empty sentences
+run("$saxon outDir=$tmpTEI -xsl:$scriptStripSents $inTEI");
+make_dir($outDir);
+# Copy the top-level XML files of the temporary corpus to the output
+foreach my $rootFile (glob "$tmpTEI/*.xml") {
+    copy($rootFile, $outDir) or die "Cant copy $rootFile to $outDir: $!\n";
+}
+
+foreach my $yearDir (glob "$tmpTEI/*") {
+    next unless -d $yearDir;
+    my ($year) = $yearDir =~ m|/(\d\d\d\d)$| or die "Strange $yearDir\n";
+    print STDERR "INFO: Processing $country $year\n";
+    make_dir("$outDir/$year");
+    foreach my $inFile (glob "$tmpTEI/$year/*.xml") {
+        # Fail early instead of building paths from an undefined name
+        my ($fName) = $inFile =~ m|/([^/]+)\.ana\.xml|
+            or die "Strange input file $inFile\n";
+        my $tmpFile1 = "$tmpDir/$fName.body.xml";
+        my $tmpFile2 = "$tmpDir/$fName.note.xml";
+        my $conllFile = "$conllDir/$year/$fName.conllu";
+        # NOTE(review): this drops the first "-en" anywhere in the path, not
+        # only in the file name -- confirm that this is intended
+        $conllFile =~ s|-en||;
+        die "Cant find ConLL-U file $conllFile\n" unless -e $conllFile;
+        my $outFile = "$outDir/$year/$fName.ana.xml";
+        print STDERR "INFO: Processing $year/$fName\n";
+        run("$scriptConllu2Tei < $conllFile > $tmpFile1");
+        run("$saxon notesFile=$notesFile -xsl:$scriptInsertNotes $inFile > $tmpFile2");
+        run("$scriptInsertSents $tmpFile1 < $tmpFile2 | $scriptPolish > $outFile");
+    }
+}
+
+# Create a directory unless it already exists; die when that fails.
+sub make_dir {
+    my ($dir) = @_;
+    return if -d $dir;
+    mkdir $dir or die "Cant create directory $dir: $!\n";
+    return;
+}
+
+# Run a shell command with its standard output captured and discarded;
+# die when the command cannot be started or exits with a non-zero status.
+sub run {
+    my ($cmd) = @_;
+    my $ignored = `$cmd`;
+    die "Command failed (status $?): $cmd\n" if $? != 0;
+    return;
+}
diff --git a/Scripts/mt-csv2tsv.pl b/Scripts/mt-csv2tsv.pl
new file mode 100755
index 000000000..d2f8a5ff4
--- /dev/null
+++ b/Scripts/mt-csv2tsv.pl
@@ -0,0 +1,68 @@
+#!/usr/bin/perl
+# From MTed csv file + CoNLL-U file make TSV with sentence ID and translation
+# Tomaž Erjavec
+# License: GNU GPL
+#
+# Usage: mt-csv2tsv.pl <inDir>
+# Processes every *.eng.csv file found directly in <inDir> or in one of its
+# immediate subdirectories. The sentence IDs are read from the .conllu file
+# with the same base name and paired, in order, with the text that follows
+# the second comma of each CSV row. The pairs go to <base name>-en.tsv.
+
+use strict;
+use warnings;
+use utf8;
+binmode(STDIN, ':utf8');
+binmode(STDOUT, ':utf8');
+binmode(STDERR, ':utf8');
+
+# Without a directory the glob below would scan the file system root
+my $inDirs = shift;
+die "Usage: $0 <inDir>\n" unless defined $inDirs and length $inDirs;
+my $csv_ext = '.eng.csv';
+my $conll_ext = '.conllu';
+my $tsv_ext = '-en.tsv';
+
+foreach my $csv_file (glob "$inDirs/*$csv_ext $inDirs/*/*$csv_ext") {
+    print STDERR "INFO: Doing file $csv_file\n";
+    my ($fName) = $csv_file =~ m|(.+)\Q$csv_ext\E|;
+    my $conll_file = $fName . $conll_ext;
+    die "Cant find $conll_file!\n" unless -e $conll_file;
+    my $tsv_file = "$fName$tsv_ext";
+
+    # Collect the sentence IDs in the order they appear in the CoNLL-U file
+    open(my $meta, '<:utf8', $conll_file) or die "Cant open $conll_file: $!\n";
+    my @sents = ();
+    while (<$meta>) {
+        chomp;
+        if (m|# sent_id = (.+)|) {
+            push(@sents, $1)
+        }
+    }
+    close $meta;
+
+    open(my $out, '>:utf8', $tsv_file) or die "Cant open $tsv_file: $!\n";
+    open(my $in, '<:utf8', $csv_file) or die "Cant open $csv_file: $!\n";
+    while (<$in>) {
+        # Lines starting with "file" are skipped (presumably the header row)
+        next if /^file/;
+        chomp;
+        my ($text) = /.+?,.+?,(.+)/;
+        unless (defined $text) {
+            # Keep the row so that IDs and translations stay aligned
+            warn "WARN: no text after second comma in $csv_file line $.\n";
+            $text = '';
+        }
+        die "TAB in text $text!\n" if $text =~ /\t/;
+        # Undo CSV quoting: strip the enclosing quotes, unescape doubled ones
+        $text =~ s/^"//;
+        $text =~ s/"$//;
+        $text =~ s/""/"/g;
+        die "No more sentence IDs!\n" unless @sents;
+        my $id = shift @sents;
+        print {$out} "$id\t$text\n"
+    }
+    close $in;
+    # Buffered write errors only surface when the handle is closed
+    close $out or die "Cant write $tsv_file: $!\n";
+    die "Too many sentence IDs!\n" if @sents;
+}
diff --git a/Scripts/mt-insert-notes.xsl b/Scripts/mt-insert-notes.xsl
new file mode 100644
index 000000000..b1fe302e7
--- /dev/null
+++ b/Scripts/mt-insert-notes.xsl
@@ -0,0 +1,131 @@
+
+
+
+
+
+ en
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/Scripts/mt-insert-s.pl b/Scripts/mt-insert-s.pl
new file mode 100755
index 000000000..9c9fde4c4
--- /dev/null
+++ b/Scripts/mt-insert-s.pl
@@ -0,0 +1,47 @@
+#!/usr/bin/perl
+# Insert annotated sentences into skeleton TEI
+# Tomaž Erjavec
+# License: GNU GPL
+
+use warnings;
+use utf8;
+binmode(STDIN, ':utf8');
+binmode(STDOUT, ':utf8');
+binmode(STDERR, ':utf8');
+
+$sentFile = shift;
+
+open(TBL, '<:utf8', $sentFile) or die "Cant find $sentFile!\n";
+$/ = "\n";
+while () {
+ next unless m||;
+ s|.+) {
+ if (m|| corresp="mt-src:$id1">|;
+ print $sent;
+ }
+ else {
+ print;
+ }
+}
+sub clean {
+ my $content = shift;
+ my $prev_content = shift;
+ $content = $prev_content . " " . $content;
+ $content =~ s|<.+?>||sg; #Remove any markup from comment, like