Skip to content

Commit

Permalink
Merge pull request #27 from clarin-eric/data
Browse files Browse the repository at this point in the history
Data
  • Loading branch information
matyaskopp authored Jul 28, 2023
2 parents 375c3af + 4776910 commit ea73068
Show file tree
Hide file tree
Showing 1,246 changed files with 1,515,792 additions and 720,769 deletions.
2 changes: 1 addition & 1 deletion .github/actions/ParlaMintStatus/status.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,5 @@ echo "DEBUG: parla_process=${parla_process}"
# Publish the computed values as step outputs: every line appended to the file
# named by $GITHUB_OUTPUT has the form key=value.
echo "parla_process=${parla_process}" >> $GITHUB_OUTPUT
echo "parla_all=${parla_all}" >> $GITHUB_OUTPUT
echo "parla_changed=${parla_changed}" >> $GITHUB_OUTPUT
# NOTE(review): the two scripts_changed lines below appear to be the before/after
# pair of this hunk's single change (the hunk reports 1 addition & 1 deletion) --
# confirm against the raw diff.
# Both forms flatten newlines inside the value to spaces with tr. That also turns
# echo's own trailing newline into a space, so the first form leaves the entry
# without a line terminator. The second form pipes through sed to put a newline
# back at the end (relies on sed expanding \n in the replacement, as GNU sed does),
# so the next key starts on its own line.
echo "scripts_changed=${scripts_changed}" | tr "\n" " " >> $GITHUB_OUTPUT
echo "scripts_changed=${scripts_changed}" | tr "\n" " " | sed "s/$/\n/" >> $GITHUB_OUTPUT
echo "parla_changed_size=${parla_changed_size}" >> $GITHUB_OUTPUT
28 changes: 17 additions & 11 deletions .github/actions/ParlaMintValidate/validate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,37 @@ for parla in $(jq -r '.[]' <<< $1 ); do
# Per-corpus cleanup followed by validation/conversion of the plain TEI version.
# NOTE(review): this is a rendered diff hunk -- pre-change and post-change lines
# are both present without +/- markers, so several commands appear twice (once
# with the plain `error` pattern, once with the `\berror\b` pattern).
#
# Shared output filter: sed prefixes `::error::` (GitHub Actions workflow-command
# syntax) to every line that contains "error", case-insensitively; the `\b...\b`
# variant only matches "error" as a whole word (`\b` and the `i` flag are GNU sed
# extensions -- presumably the runner uses GNU sed; confirm).
#
# Fallback pattern: `( script 2>&1 || echo "ERROR: ... exited with <> 0" )` adds
# an ERROR line to the stream when the script exits non-zero, so the failure is
# picked up by the same sed filter.
echo "Cleaning old sample files [$parla]"
# Brace expansion: removes ParlaMint-*.txt, *.tsv, *.conllu and *.vert in the corpus directory.
rm -f ${DATADIR}/ParlaMint-$parla/ParlaMint-*.{txt,tsv,conllu,vert}

Scripts/validate-parlamint.pl Schema ${DATADIR}/ParlaMint-$parla 2>&1 | sed "s/^\(.*\)\(error\)/::error::\1\2/i" | tee $DIR/validate.log
# Everything TEI-related is guarded by the presence of the corpus root file;
# the else branch only emits a ::warning:: line.
if [ -f "${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.xml" ] ; then

echo "Validating parla-CLARIN (TEI)"
java -jar /usr/share/java/saxon.jar -xi -xsl:Scripts/copy.xsl ${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.xml > $TESTDIR/ParlaMint-$parla.xml
java -jar /usr/share/java/jing.jar Schema/parla-clarin.rng $TESTDIR/ParlaMint-$parla.xml| sed "s/^\(.*\)\(error\)/::error::\1\2/i" | tee $DIR/parla-clarin-validate-tei.log
( Scripts/validate-parlamint.pl Schema ${DATADIR}/ParlaMint-$parla 2>&1 || echo "ERROR: validate-parlamint.pl exited with <> 0" ) \
| sed "s/^\(.*\)\(\berror\b\)/::error::\1\2/i" | tee $DIR/validate.log

echo "CONVERT to text and metadata"
Scripts/parlamintp-tei2text.pl ${DATADIR}/ParlaMint-$parla $DIR 2>&1 | sed "s/^\(.*\)\(error\)/::error::\1\2/i" | tee $DIR/text.log
echo "Validating parla-CLARIN (TEI)"
# The root file is first passed through Scripts/copy.xsl with saxon's -xi option
# and written to $TESTDIR; jing then validates that copy against parla-clarin.rng.
# NOTE(review): unlike the Perl script calls, the jing call has no `2>&1` and no
# `|| echo "ERROR..."` fallback, so only its stdout reaches the filter and the log.
java -jar /usr/share/java/saxon.jar -xi -xsl:Scripts/copy.xsl ${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.xml > $TESTDIR/ParlaMint-$parla.xml
java -jar /usr/share/java/jing.jar Schema/parla-clarin.rng $TESTDIR/ParlaMint-$parla.xml| sed "s/^\(.*\)\(\berror\b\)/::error::\1\2/i" | tee $DIR/parla-clarin-validate-tei.log

echo "CONVERT to text and metadata"
( Scripts/parlamintp-tei2text.pl ${DATADIR}/ParlaMint-$parla $DIR 2>&1 || echo "ERROR: parlamintp-tei2text.pl exited with <> 0" ) \
| sed "s/^\(.*\)\(\berror\b\)/::error::\1\2/i" | tee $DIR/text.log
else
echo "::warning::skipping TEI version validation - missing corpus root file ParlaMint-$parla/ParlaMint-$parla.xml"
fi

# Validation and conversion of the linguistically annotated (TEI.ana) version;
# runs only when the .ana.xml corpus root file exists, otherwise a ::warning::
# line is printed.
# NOTE(review): this is a rendered diff hunk -- pre-change and post-change lines
# are both present without +/- markers, so commands and messages appear twice.
if [ -f "${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.ana.xml" ] ; then
echo "Validating parla-CLARIN (TEI.ana)"
java -jar /usr/share/java/saxon.jar -xi -xsl:Scripts/copy.xsl ${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.ana.xml > $TESTDIR/ParlaMint-$parla.ana.xml
# NOTE(review): the tee target is $DIR/parla-clarin-validate-tei.log, the same
# file the plain-TEI jing run writes, and tee is used without -a -- so this run
# replaces that log. Confirm whether a separate .ana log was intended.
java -jar /usr/share/java/jing.jar Schema/parla-clarin.rng $TESTDIR/ParlaMint-$parla.ana.xml | sed "s/^\(.*\)\(error\)/::error::\1\2/i" | tee $DIR/parla-clarin-validate-tei.log
java -jar /usr/share/java/jing.jar Schema/parla-clarin.rng $TESTDIR/ParlaMint-$parla.ana.xml | sed "s/^\(.*\)\(\berror\b\)/::error::\1\2/i" | tee $DIR/parla-clarin-validate-tei.log

echo "CONVERT to vert"
# Here tee comes before sed, so vert.log stores the raw output and only the
# console stream gets the ::error:: prefix (the other steps log the tagged text).
Scripts/parlamint-tei2vert.pl ${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.ana.xml $DIR 2>&1 | tee $DIR/vert.log | sed "s/^\(.*\)\(error\)/::error::\1\2/i"
Scripts/parlamint-tei2vert.pl ${DATADIR}/ParlaMint-$parla/ParlaMint-$parla.ana.xml $DIR 2>&1 | tee $DIR/vert.log | sed "s/^\(.*\)\(\berror\b\)/::error::\1\2/i"

echo "CONVERT and VALIDATE CoNLLu format"
# The Perl filter classifies each output line:
#   - on lines starting with INFO it reads the digit from "Validating level N:" into $L;
#   - $ERROR is "warning" when $L > 1 and the line does not match /morpho/i
#     (the older filter has no /morpho/ exception), otherwise "error";
#   - lines containing the word error/errors get the ::$ERROR:: prefix;
#   - the first line carrying each distinct "[L2...]" tag is additionally prefixed
#     with "::$ERROR::(1st of this type)", tracked in the hash referenced by $s.
# NOTE(review): %seen is never assigned anywhere in the one-liner, so the
# `if($seen{...})` branch cannot fire -- looks like a leftover.
# NOTE(review): `$1` is tested after an unconditional m//; when that match fails,
# Perl keeps the captures of the last successful match, which here can be the
# preceding s/// -- so a line with "error" but no [L2...] tag may be treated as a
# new "type". Confirm and consider `if (m/\[(L2[^\]]*)\]/ && !$s->{$1})`.
Scripts/parlamint2conllu.pl ${DATADIR}/ParlaMint-$parla $DIR 2>&1 \
| perl -pe 'if(/^INFO/){($L) = $_ =~ m/Validating level (\d):/;} $ERROR= $L>1 ? "warning" : "error"; s/^(.*)(error)/\:\:$ERROR\:\:$1$2/i;' \
( Scripts/parlamint2conllu.pl ${DATADIR}/ParlaMint-$parla $DIR 2>&1 || echo "ERROR: parlamint2conllu.pl exited with <> 0" ) \
| perl -pe '$s //= {}; if(/^INFO/){($L) = $_ =~ m/Validating level (\d):/;} $ERROR= ($L>1 && !/morpho/i) ? "warning" : "error"; s/^(.*)(\berrors?\b)/\:\:$ERROR\:\:$1$2/i; if($seen{m/\[L2[^\]]*\]/}){s/^/\:\:$ERROR\:\:/}; m/\[(L2[^\]]*)\]/; if($1 && !$s->{$1}){$s->{$1}=1;s/^/\:\:$ERROR\:\:(1st of this type)/;}' \
| tee $DIR/conllu.log

else
echo "::warning::skipping annotated version validation - missing corpus root file"
echo "::warning::skipping TEI.ana version validation - missing corpus root file ParlaMint-$parla/ParlaMint-$parla.ana.xml"
fi

echo "Move new files to ParlaMint-$parla"
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/createSample.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,11 @@ jobs:
# Step: prune derived sample files whose source XML is no longer referenced.
# For every corpus code in the parla_changed output (a JSON array unpacked with
# jq), each file matching ParlaMint-<code>_* is mapped to an XML name by
# replacing a derived-format suffix (-meta.tsv, .conllu, .txt, .vert) with .xml.
# If that name does not occur as a fixed string (grep -F) in the corpus root
# files ParlaMint-<code>.xml / ParlaMint-<code>.ana.xml, the file is removed
# with `git rm`.
# The backslash in `$parla\_*` keeps the underscore from being read as part of
# the variable name.
# NOTE(review): this is a rendered diff hunk, so two variants of the inner `for`
# line and of the `xmlfile=` line appear together without +/- markers:
#   - glob: `ParlaMint-$parla/ParlaMint-...` covers only the corpus directory
#     itself; the `**/` form (with `shopt -s globstar`) also covers
#     sub-directories;
#   - sed: `s/^.*\///` reduces the path to its basename, whereas
#     `s/^Data\/ParlaMint-[^\/]*\///` strips only the corpus directory and keeps
#     any sub-directory part in the name that is searched for.
# NOTE(review): `for file in $(echo ...)` word-splits the expansion, so paths
# containing whitespace would be broken apart -- presumably not an issue for
# this naming scheme; confirm.
- name: Remove unused data from repository
run: |
cd $GITHUB_WORKSPACE/ParlaMint
shopt -s globstar
for parla in $(jq -r '.[]' <<< '${{needs.Changes.outputs.parla_changed}}' ); do
for file in $(echo Data/ParlaMint-$parla/ParlaMint-$parla\_*) ; do
for file in $(echo Data/ParlaMint-$parla/**/ParlaMint-$parla\_*) ; do
echo "testing $file"
xmlfile=$(echo $file|sed -E 's/(-meta\.tsv|\.conllu|\.txt|\.vert)$/.xml/;s/^.*\///')
xmlfile=$(echo $file|sed -E 's/(-meta\.tsv|\.conllu|\.txt|\.vert)$/.xml/;s/^Data\/ParlaMint-[^\/]*\///')
cat Data/ParlaMint-$parla/ParlaMint-$parla{,.ana}.xml | grep -Fq "$xmlfile" || git rm "$file"
done
done
Expand All @@ -86,7 +87,7 @@ jobs:
run: |
cd $GITHUB_WORKSPACE/ParlaMint
for parla in $(jq -r '.[]' <<< '${{needs.Changes.outputs.parla_changed}}' ); do
git add Data/ParlaMint-$parla/ParlaMint-*.{txt,tsv,conllu,vert} || echo "::warning:: $parla suppress fatal: pathspec '<FILE>' did not match any files"
git add Data/ParlaMint-$parla/ParlaMint-*.{txt,tsv,conllu,vert,xml} || echo "::warning:: $parla suppress fatal: pathspec '<FILE>' did not match any files"
git diff --name-only Data/ParlaMint-$parla
done
git status
Expand Down
Loading

0 comments on commit ea73068

Please sign in to comment.