Skip to content

Commit

Permalink
Merge pull request #28036 from roberth/frog
Browse files Browse the repository at this point in the history
frog: init at v0.13.7
  • Loading branch information
Mic92 committed Aug 26, 2017
2 parents 2d43c1f + c5ec8cf commit 58dc4a8
Show file tree
Hide file tree
Showing 23 changed files with 516 additions and 0 deletions.
53 changes: 53 additions & 0 deletions pkgs/development/libraries/languagemachines/frog.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Frog: a tagger / lemmatizer / morphological analyzer / dependency parser
# for Dutch, built on top of the other LanguageMachines libraries.
#
# All arguments are supplied by callPackage. `languageMachines` is the
# package set that provides ticcutils, timbl, mbt, libfolia, ucto and the
# frogdata data files used below.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines
}:

let
  # Version, tarball URL and hash live in a JSON file next to this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-frog.json);

  # The release tag carries a leading "v" (e.g. "v0.13.7"). Strip it so the
  # version starts with a digit and can be split off the derivation name.
  # removePrefix is a no-op when the prefix is absent.
  version = stdenv.lib.removePrefix "v" release.version;
in

stdenv.mkDerivation {
  # Derivation names are expected to be "<name>-<version>".
  name = "frog-${version}";
  inherit version;

  src = fetchurl {
    inherit (release) url sha256;
    # Keep the raw tag in the file name so the source store path is unchanged.
    name = "frog-${release.version}.tar.gz";
  };

  # Tools that only run on the build machine.
  nativeBuildInputs = [ pkgconfig automake autoconf ];

  # NOTE(review): libtool and autoconf-archive are left here so their aclocal
  # macros stay discoverable the same way as before; confirm whether they can
  # move to nativeBuildInputs as well.
  buildInputs = [
    libtool autoconf-archive
    libxml2 icu
    languageMachines.ticcutils
    languageMachines.timbl
    languageMachines.mbt
    languageMachines.libfolia
    languageMachines.ucto
    languageMachines.frogdata
  ];

  # The source is a repository tarball; bootstrap.sh is run before the
  # configure phase (presumably to generate ./configure).
  preConfigure = ''
    sh bootstrap.sh
  '';

  postInstall = ''
    # frog expects the data files installed in the same prefix
    mkdir -p "$out/share/frog/"
    for f in ${languageMachines.frogdata}/share/frog/*; do
      ln -s "$f" "$out/share/frog/"
    done

    # The test suite runs after installation and after the data files have
    # been linked (presumably because the tests need them in $out).
    make check
  '';

  meta = with stdenv.lib; {
    description = "A Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for Dutch";
    homepage = "https://languagemachines.github.io/frog";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];

    longDescription = ''
      Frog is an integration of memory-based natural language processing (NLP) modules developed for Dutch. All NLP modules are based on Timbl, the Tilburg memory-based learning software package. Most modules were created in the 1990s at the ILK Research Group (Tilburg University, the Netherlands) and the CLiPS Research Centre (University of Antwerp, Belgium). Over the years they have been integrated into a single text processing tool, which is currently maintained and developed by the Language Machines Research Group and the Centre for Language and Speech Technology at Radboud University Nijmegen. A dependency parser, a base phrase chunker, and a named-entity recognizer module were added more recently. Where possible, Frog makes use of multi-processor support to run subtasks in parallel.
      Various (re)programming rounds have been made possible through funding by NWO, the Netherlands Organisation for Scientific Research, particularly under the CGN project, the IMIX programme, the Implicit Linguistics project, the CLARIN-NL programme and the CLARIAH programme.
    '';
  };

}
31 changes: 31 additions & 0 deletions pkgs/development/libraries/languagemachines/frogdata.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frogdata: the data files used by Frog, packaged separately from the
# frog binary.
#
# All arguments are supplied by callPackage. libxml2, icu and
# languageMachines are accepted but not used by this expression; they are
# kept so existing callers/overrides keep working.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines
}:

let
  # Version, tarball URL and hash live in a JSON file next to this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-frogdata.json);

  # The release tag carries a leading "v" (e.g. "v0.13"). Strip it so the
  # version starts with a digit and can be split off the derivation name.
  # removePrefix is a no-op when the prefix is absent.
  version = stdenv.lib.removePrefix "v" release.version;
in

stdenv.mkDerivation {
  # Derivation names are expected to be "<name>-<version>".
  name = "frogdata-${version}";
  inherit version;

  src = fetchurl {
    inherit (release) url sha256;
    # Keep the raw tag in the file name so the source store path is unchanged.
    name = "frogdata-${release.version}.tar.gz";
  };

  # Tools that only run on the build machine.
  nativeBuildInputs = [ pkgconfig automake autoconf ];

  # NOTE(review): libtool and autoconf-archive are left here so their aclocal
  # macros stay discoverable the same way as before; confirm whether they can
  # move to nativeBuildInputs as well.
  buildInputs = [ libtool autoconf-archive ];

  # The source is a repository tarball; bootstrap.sh is run before the
  # configure phase (presumably to generate ./configure).
  preConfigure = ''
    sh bootstrap.sh
  '';

  meta = with stdenv.lib; {
    description = "Data for Frog, a Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for Dutch";
    homepage = "https://languagemachines.github.io/frog";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
  };

}
30 changes: 30 additions & 0 deletions pkgs/development/libraries/languagemachines/libfolia.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# libfolia: a C++ library for reading, manipulating and creating FoLiA
# (XML-based linguistic annotation) documents.
#
# All arguments are supplied by callPackage; `languageMachines` provides
# the ticcutils dependency.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines }:

let
  # Version, tarball URL and hash live in a JSON file next to this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-libfolia.json);

  # The release tag carries a leading "v" (e.g. "v1.7"). Strip it so the
  # version starts with a digit and can be split off the derivation name.
  # removePrefix is a no-op when the prefix is absent.
  version = stdenv.lib.removePrefix "v" release.version;
in

stdenv.mkDerivation {
  # Derivation names are expected to be "<name>-<version>".
  name = "libfolia-${version}";
  inherit version;

  src = fetchurl {
    inherit (release) url sha256;
    # Keep the raw tag in the file name so the source store path is unchanged.
    name = "libfolia-${release.version}.tar.gz";
  };

  # Tools that only run on the build machine.
  nativeBuildInputs = [ pkgconfig automake autoconf ];

  # NOTE(review): libtool and autoconf-archive are left here so their aclocal
  # macros stay discoverable the same way as before; confirm whether they can
  # move to nativeBuildInputs as well.
  buildInputs = [ libtool autoconf-archive libxml2 icu languageMachines.ticcutils ];

  # The source is a repository tarball; bootstrap.sh is run before the
  # configure phase (presumably to generate ./configure).
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    description = "A C++ API for FoLiA documents; an XML-based linguistic annotation format.";
    homepage = "https://proycon.github.io/folia/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];

    longDescription = ''
      A high-level C++ API to read, manipulate, and create FoLiA documents. FoLiA is an XML-based annotation format, suitable for the representation of linguistically annotated language resources. FoLiA’s intended use is as a format for storing and/or exchanging language resources, including corpora.
    '';
  };

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
--- a/configure.ac 2017-06-12 06:48:15.000000000 +0200
+++ b/configure.ac 2017-06-12 06:50:06.000000000 +0200
@@ -76,6 +76,10 @@
CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
LIBS="$LIBS $ticcutils_LIBS"

+PKG_CHECK_MODULES([libxml2], [libxml-2.0 >= 2.6.16] )
+CXXFLAGS="$CXXFLAGS $libxml2_CFLAGS"
+LIBS="$LIBS $libxml2_LIBS"
+
AC_CONFIG_FILES([
Makefile
mbt.pc
40 changes: 40 additions & 0 deletions pkgs/development/libraries/languagemachines/mbt.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# mbt: the Memory Based Tagger (tagger generator and tagger).
#
# All arguments are supplied by callPackage; `languageMachines` provides
# the ticcutils and timbl dependencies.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2
, languageMachines
}:

let
  # Version, tarball URL and hash live in a JSON file next to this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-mbt.json);

  # The release tag carries a leading "v" (e.g. "v3.2.16"). Strip it so the
  # version starts with a digit and can be split off the derivation name.
  # removePrefix is a no-op when the prefix is absent.
  version = stdenv.lib.removePrefix "v" release.version;
in

stdenv.mkDerivation {
  # Derivation names are expected to be "<name>-<version>".
  name = "mbt-${version}";
  inherit version;

  src = fetchurl {
    inherit (release) url sha256;
    # Keep the raw tag in the file name so the source store path is unchanged.
    name = "mbt-${release.version}.tar.gz";
  };

  # Tools that only run on the build machine.
  nativeBuildInputs = [ pkgconfig automake autoconf ];

  # NOTE(review): libtool and autoconf-archive are left here so their aclocal
  # macros stay discoverable the same way as before; confirm whether they can
  # move to nativeBuildInputs as well.
  buildInputs = [
    libtool autoconf-archive
    libxml2
    languageMachines.ticcutils
    languageMachines.timbl
  ];

  # Judging by its name, this patch makes the libxml2 dependency explicit in
  # the build system; it is applied before bootstrap.sh runs.
  patches = [ ./mbt-add-libxml2-dep.patch ];

  # The source is a repository tarball; bootstrap.sh is run before the
  # configure phase (presumably to generate ./configure).
  preConfigure = ''
    sh bootstrap.sh
  '';

  meta = with stdenv.lib; {
    description = "Memory Based Tagger";
    homepage = "https://languagemachines.github.io/mbt/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];

    longDescription = ''
      MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech.
      Mbt is used by Frog for Dutch tagging.
    '';
  };

}
14 changes: 14 additions & 0 deletions pkgs/development/libraries/languagemachines/packages.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Package set for the LanguageMachines NLP stack.
#
# Takes the caller's `callPackage` and returns an attribute set with one
# package per expression file in this directory, each built with an empty
# override set.
{ callPackage }:

let
  # Build the expression at `file` with no extra arguments.
  build = file: callPackage file { };
in
{
  # Utility and document-format libraries.
  ticcutils = build ./ticcutils.nix;
  libfolia = build ./libfolia.nix;

  # Memory-based learning: learner, its server, and the tagger built on it.
  timbl = build ./timbl.nix;
  timblserver = build ./timblserver.nix;
  mbt = build ./mbt.nix;

  # ucto and its data files.
  ucto = build ./ucto.nix;
  uctodata = build ./uctodata.nix;

  # Frog and its data files.
  frog = build ./frog.nix;
  frogdata = build ./frogdata.nix;

  # Test derivation defined in test.nix.
  test = build ./test.nix;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v0.13.7",
"url": "https://api.github.com/repos/LanguageMachines/frog/tarball/v0.13.7",
"sha256": "0swyfi3g862n888qj8v8kd18745hasy0vnc70i9qlv0ji0321bnf"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v0.13",
"url": "https://api.github.com/repos/LanguageMachines/frogdata/tarball/v0.13",
"sha256": "13mhv8qacl0n20ddl1ay49xi6h2m0a149ya3rrsmaah3x4adb4sg"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v1.7",
"url": "https://api.github.com/repos/LanguageMachines/libfolia/tarball/v1.7",
"sha256": "0hpxdry7n2887klryc587xv46p6z6jp6hz9x7k2pk5v7jb0z4s65"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v3.2.16",
"url": "https://api.github.com/repos/LanguageMachines/mbt/tarball/v3.2.16",
"sha256": "0f9f5l84m0lmmv4km9myn3yhy67jbmk3qn2fi40dy025gx4l0x3x"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v0.15",
"url": "https://api.github.com/repos/LanguageMachines/ticcutils/tarball/v0.15",
"sha256": "0lssb1klx2flmr6fy78j37i5lbq3gfhzjx24j6n72ndm2rvprvcn"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v6.4.9",
"url": "https://api.github.com/repos/LanguageMachines/timbl/tarball/v6.4.9",
"sha256": "1279npc3xlq05hnkylpbkgg941gjhvl6sd5fw4vgwcx2rwmmlaay"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v1.11",
"url": "https://api.github.com/repos/LanguageMachines/timblserver/tarball/v1.11",
"sha256": "02k8c704wr5miy82w6zj0imm7sdfnxf3db34qiaa8l3myhn17qlw"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v0.9.6",
"url": "https://api.github.com/repos/LanguageMachines/ucto/tarball/v0.9.6",
"sha256": "0fxq4j32g7kp6789xz23651c4v2j7zlz87cshfv9g1xjs7jxns3f"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"version": "v0.4",
"url": "https://api.github.com/repos/LanguageMachines/uctodata/tarball/v0.4",
"sha256": "02c78qmwi9ijpk5wila3p62fmfdy1rpmlvvzbxs3wg0rdb0nwvd2"
}
25 changes: 25 additions & 0 deletions pkgs/development/libraries/languagemachines/test.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Smoke test for the frog package.
#
# Runs the frog binary with the single sentence "Dit is een test" on stdin,
# stores frog's stdout as the derivation output ($out), prints it to the
# build log, and then checks it:
#   * the output has exactly five lines;
#   * some line containing "is" also contains "zijn";
#   * some line containing "een" also contains "onbep";
#   * awk fields 1 and 9 of every line, printed as "<f1> -> <f9>; " and
#     joined, equal the fixed expected sequence.
# A failed check calls the shell helper `expected`, which prints
# "Test expectation failed: <message>" and exits with status 1.
#
# Arguments are supplied by callPackage; `languageMachines` provides frog.
{ runCommand
, languageMachines
}:

runCommand "frog-test" {} ''
${languageMachines.frog}/bin/frog >$out <<EOF
Dit is een test
EOF
echo "Frog output:"
cat $out
expected () {
echo "Test expectation failed: $@"
exit 1
}
lines="$(wc -l $out | awk '{print $1}')"
test 5 = $lines || expected "Five lines of output"
grep "is" $out | grep "zijn" >/dev/null || expected "Stemming works"
grep "een" $out | grep "onbep" >/dev/null || expected "Tagging works"
deps="$(echo $(awk 'BEGIN { FS = "\t*" } ; {print $1 " -> " $9 "; "}' <$out))"
test "1 -> 2; 2 -> 0; 3 -> 4; 4 -> 2; -> ;" = "$deps" || expected "Dependency parsing works"
''
29 changes: 29 additions & 0 deletions pkgs/development/libraries/languagemachines/ticcutils.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# ticcutils: shared utility library of the TiCC / LanguageMachines stack.
#
# All arguments are supplied by callPackage.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, zlib, bzip2, libtar }:

let
  # Version, tarball URL and hash live in a JSON file next to this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-ticcutils.json);

  # The release tag carries a leading "v" (e.g. "v0.15"). Strip it so the
  # version starts with a digit and can be split off the derivation name.
  # removePrefix is a no-op when the prefix is absent.
  version = stdenv.lib.removePrefix "v" release.version;
in

stdenv.mkDerivation {
  # Derivation names are expected to be "<name>-<version>".
  name = "ticcutils-${version}";
  inherit version;

  src = fetchurl {
    inherit (release) url sha256;
    # Keep the raw tag in the file name so the source store path is unchanged.
    name = "ticcutils-${release.version}.tar.gz";
  };

  # Tools that only run on the build machine.
  nativeBuildInputs = [ pkgconfig automake autoconf ];

  # NOTE(review): libtool and autoconf-archive are left here so their aclocal
  # macros stay discoverable the same way as before; confirm whether they can
  # move to nativeBuildInputs as well.
  buildInputs = [
    libtool autoconf-archive libxml2
    # optional:
    zlib bzip2 libtar
    # broken but optional: boost
  ];

  # The source is a repository tarball; bootstrap.sh is run before the
  # configure phase (presumably to generate ./configure).
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    description = "This module contains useful functions for general use in the TiCC software stack and beyond.";
    homepage = "https://github.com/LanguageMachines/ticcutils";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
  };

}
36 changes: 36 additions & 0 deletions pkgs/development/libraries/languagemachines/timbl.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# timbl: the Tilburg memory-based learner.
#
# All arguments are supplied by callPackage; `languageMachines` provides
# the ticcutils dependency.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2
, languageMachines
}:

let
  # Version, tarball URL and hash live in a JSON file next to this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-timbl.json);

  # The release tag carries a leading "v" (e.g. "v6.4.9"). Strip it so the
  # version starts with a digit and can be split off the derivation name.
  # removePrefix is a no-op when the prefix is absent.
  version = stdenv.lib.removePrefix "v" release.version;
in

stdenv.mkDerivation {
  # Derivation names are expected to be "<name>-<version>".
  name = "timbl-${version}";
  inherit version;

  src = fetchurl {
    inherit (release) url sha256;
    # Keep the raw tag in the file name so the source store path is unchanged.
    name = "timbl-${release.version}.tar.gz";
  };

  # Tools that only run on the build machine.
  nativeBuildInputs = [ pkgconfig automake autoconf ];

  # NOTE(review): libtool and autoconf-archive are left here so their aclocal
  # macros stay discoverable the same way as before; confirm whether they can
  # move to nativeBuildInputs as well.
  buildInputs = [
    libtool autoconf-archive
    libxml2
    languageMachines.ticcutils
  ];

  # The source is a repository tarball; bootstrap.sh is run before the
  # configure phase (presumably to generate ./configure).
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    description = "TiMBL implements several memory-based learning algorithms";
    homepage = "https://github.com/LanguageMachines/timbl/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];

    longDescription = ''
      TiMBL is an open source software package implementing several memory-based learning algorithms, among which IB1-IG, an implementation of k-nearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision-tree approximation of IB1-IG. All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases.
      For over fifteen years TiMBL has been mostly used in natural language processing as a machine learning classifier component, but its use extends to virtually any supervised machine learning domain. Due to its particular decision-tree-based implementation, TiMBL is in many cases far more efficient in classification than a standard k-nearest neighbor algorithm would be.
    '';
  };

}
37 changes: 37 additions & 0 deletions pkgs/development/libraries/languagemachines/timblserver.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# timblserver: a server front-end for the TiMBL memory-based learner.
#
# All arguments are supplied by callPackage; `languageMachines` provides
# the ticcutils and timbl dependencies.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2
, languageMachines
}:

let
  # Version, tarball URL and hash live in a JSON file next to this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-timblserver.json);

  # The release tag carries a leading "v" (e.g. "v1.11"). Strip it so the
  # version starts with a digit and can be split off the derivation name.
  # removePrefix is a no-op when the prefix is absent.
  version = stdenv.lib.removePrefix "v" release.version;
in

stdenv.mkDerivation {
  # Derivation names are expected to be "<name>-<version>".
  name = "timblserver-${version}";
  inherit version;

  src = fetchurl {
    inherit (release) url sha256;
    # Keep the raw tag in the file name so the source store path is unchanged.
    name = "timblserver-${release.version}.tar.gz";
  };

  # Tools that only run on the build machine.
  nativeBuildInputs = [ pkgconfig automake autoconf ];

  # NOTE(review): libtool and autoconf-archive are left here so their aclocal
  # macros stay discoverable the same way as before; confirm whether they can
  # move to nativeBuildInputs as well.
  buildInputs = [
    libtool autoconf-archive
    libxml2
    languageMachines.ticcutils
    languageMachines.timbl
  ];

  # The source is a repository tarball; bootstrap.sh is run before the
  # configure phase (presumably to generate ./configure).
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    description = "This server for TiMBL implements several memory-based learning algorithms";
    homepage = "https://github.com/LanguageMachines/timblserver/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];

    longDescription = ''
      This implements a server for TiMBL. TiMBL is an open source software package implementing several memory-based learning algorithms, among which IB1-IG, an implementation of k-nearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision-tree approximation of IB1-IG. All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases.
      For over fifteen years TiMBL has been mostly used in natural language processing as a machine learning classifier component, but its use extends to virtually any supervised machine learning domain. Due to its particular decision-tree-based implementation, TiMBL is in many cases far more efficient in classification than a standard k-nearest neighbor algorithm would be.
    '';
  };

}
Loading

0 comments on commit 58dc4a8

Please sign in to comment.