Turn CONLL-U documents into Pandas DataFrames for easy NLP!
pip install conll-df
curl -O https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/en-ud-train.conllu
import pandas as pd
from conll_df import conll_df
# Path to the treebank file fetched by the curl command above
path = 'en-ud-train.conllu'
# Parse the file; file_index=False leaves the filename out of the index levels
df = conll_df(path, file_index=False)
# Render the first 40 rows as an HTML table
df.head(40).to_html()
w | l | x | p | g | f | e | type | gender | Case | Definite | Degree | Foreign | Gender | Mood | Number | Person | Poss | Reflex | Tense | Voice | Type | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
s | i | ||||||||||||||||||||||
1 | 1.0 | Al | Al | PROPN | NNP | 0 | root | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ |
2.0 | - | - | PUNCT | HYPH | 1 | punct | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | |
3.0 | Zaman | Zaman | PROPN | NNP | 1 | flat | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ | |
4.0 | : | : | PUNCT | : | 1 | punct | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | |
5.0 | American | american | ADJ | JJ | 6 | amod | _ | _ | _ | _ | _ | Pos | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | |
6.0 | forces | force | NOUN | NNS | 7 | nsubj | _ | _ | _ | _ | _ | _ | _ | _ | _ | Plur | _ | _ | _ | _ | _ | _ | |
7.0 | killed | kill | VERB | VBD | 1 | parataxis | _ | _ | _ | _ | _ | _ | _ | _ | Ind | _ | _ | _ | _ | Past | _ | _ | |
8.0 | Shaikh | Shaikh | PROPN | NNP | 7 | obj | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ | |
9.0 | Abdullah | Abdullah | PROPN | NNP | 8 | flat | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ | |
10.0 | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | |
2 | 1.0 | [ | [ | PUNCT | -LRB- | 10 | punct | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ |
2.0 | This | this | DET | DT | 3 | det | _ | Dem | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | Dem | |
3.0 | killing | killing | NOUN | NN | 10 | nsubj | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ | |
4.0 | of | of | ADP | IN | 7 | case | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | |
5.0 | a | a | DET | DT | 7 | det | _ | Art | _ | _ | Ind | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | Art | |
6.0 | respected | respected | ADJ | JJ | 7 | amod | _ | _ | _ | _ | _ | Pos | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | |
7.0 | cleric | cleric | NOUN | NN | 3 | nmod | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ | |
8.0 | will | will | AUX | MD | 10 | aux | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | |
9.0 | be | be | AUX | VB | 10 | aux | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | |
10.0 | causing | cause | VERB | VBG | 0 | root | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ | _ |
Name | Type | Description |
---|---|---|
path | str | Path to CONLL-U file |
add_gov | bool | Create extra columns for governor word, lemma, POS and function |
skip_morph | bool | Enable if you'd like to skip the parsing of morphological and extra fields |
v2 | bool / 'auto' | CONLL-U version of file. By default, detect from data |
drop | list | List of column names you don't need |
add_meta | bool | Add columns for sentence-level metadata |
categories | bool | Convert columns to categorical format where possible |
file_index | bool | Include filename in index levels |
extra_fields | list / 'auto' | Names of extra fields in the last column. By default, detect from data |
kwargs | dict | Additional arguments to pass to pandas.read_csv() |
Configuring these arguments can increase speed a lot, so if speed is important to you, turn off the features you don't need.
If you're working with Python and CONLL-U, you might want to take a look at tücan, which provides a command-line and web-app interface for exploring CONLL-U datasets.
Alternatively, there's plenty of cool stuff you can do with Pandas by itself. Here are some toy examples:
# Cross-tabulate the 'x' column (rows) against the 'f' column (columns);
# aggfunc=len counts the rows that fall into each group
piv = df.pivot_table(columns='f', index=['x'], aggfunc=len)
# Fill missing cells with 0 and cast the counts to int before rendering as HTML
piv.fillna(0).astype(int).to_html()
f | _ | acl | acl:relcl | advcl | advmod | amod | appos | aux | aux:pass | case | cc | cc:preconj | ccomp | compound | compound:prt | conj | cop | csubj | csubj:pass | dep | det | det:predet | discourse | dislocated | expl | fixed | flat | flat:foreign | goeswith | iobj | list | mark | nmod | nmod:npmod | nmod:poss | nmod:tmod | nsubj | nsubj:pass | nummod | obj | obl | obl:npmod | obl:tmod | orphan | parataxis | punct | reparandum | root | vocative | xcomp |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
x | ||||||||||||||||||||||||||||||||||||||||||||||||||
ADJ | 1 | 26 | 120 | 240 | 100 | 8344 | 38 | 0 | 0 | 34 | 0 | 0 | 282 | 19 | 2 | 842 | 0 | 9 | 0 | 0 | 3 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 15 | 9 | 63 | 5 | 1 | 0 | 88 | 9 | 5 | 142 | 124 | 28 | 4 | 2 | 167 | 0 | 0 | 1239 | 0 | 512 |
ADP | 0 | 2 | 11 | 0 | 26 | 0 | 0 | 1 | 0 | 16267 | 2 | 0 | 1 | 20 | 732 | 8 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 262 | 0 | 0 | 0 | 0 | 0 | 91 | 25 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 184 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
ADV | 0 | 8 | 16 | 60 | 9138 | 6 | 0 | 4 | 0 | 97 | 60 | 19 | 33 | 19 | 12 | 121 | 0 | 1 | 0 | 1 | 0 | 0 | 9 | 0 | 5 | 131 | 0 | 0 | 2 | 0 | 0 | 380 | 61 | 1 | 0 | 0 | 5 | 0 | 5 | 12 | 100 | 4 | 2 | 4 | 22 | 0 | 0 | 190 | 0 | 20 |
AUX | 0 | 0 | 15 | 31 | 0 | 0 | 1 | 6481 | 1325 | 0 | 0 | 0 | 8 | 1 | 0 | 10 | 4451 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 1 | 13 | 0 | 1 |
CCONJ | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 6599 | 82 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
DET | 0 | 0 | 0 | 2 | 10 | 0 | 4 | 0 | 0 | 1 | 0 | 2 | 2 | 5 | 0 | 32 | 0 | 0 | 0 | 0 | 15736 | 162 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 22 | 24 | 2 | 0 | 0 | 96 | 9 | 5 | 76 | 52 | 3 | 0 | 0 | 2 | 0 | 10 | 22 | 2 | 3 |
INTJ | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 587 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 81 | 0 | 0 |
NOUN | 0 | 16 | 86 | 181 | 23 | 17 | 709 | 0 | 0 | 5 | 2 | 0 | 247 | 4605 | 0 | 2416 | 0 | 3 | 1 | 3 | 0 | 0 | 14 | 2 | 0 | 24 | 54 | 0 | 4 | 37 | 203 | 0 | 4602 | 80 | 219 | 205 | 4082 | 568 | 36 | 6911 | 6235 | 359 | 483 | 12 | 245 | 12 | 1 | 1896 | 22 | 161 |
NUM | 0 | 0 | 3 | 7 | 9 | 15 | 156 | 0 | 0 | 2 | 0 | 0 | 7 | 225 | 0 | 74 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 50 | 0 | 242 | 5 | 1 | 46 | 83 | 4 | 2375 | 81 | 199 | 7 | 11 | 0 | 14 | 0 | 2 | 370 | 1 | 7 |
PART | 0 | 0 | 0 | 9 | 1572 | 0 | 0 | 0 | 0 | 684 | 0 | 0 | 2 | 2 | 0 | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 0 | 0 | 0 | 0 | 0 | 3260 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 2 | 0 | 9 |
PRON | 0 | 0 | 3 | 20 | 1 | 0 | 11 | 0 | 0 | 1 | 0 | 0 | 21 | 5 | 0 | 148 | 0 | 1 | 0 | 0 | 8 | 2 | 0 | 0 | 580 | 0 | 0 | 0 | 0 | 314 | 1 | 0 | 311 | 27 | 3054 | 0 | 10348 | 454 | 0 | 2362 | 782 | 16 | 0 | 0 | 15 | 0 | 4 | 80 | 3 | 5 |
PROPN | 0 | 3 | 8 | 28 | 0 | 12 | 511 | 0 | 0 | 2 | 0 | 0 | 23 | 3163 | 0 | 795 | 0 | 2 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 1382 | 0 | 0 | 22 | 117 | 0 | 1545 | 22 | 396 | 36 | 1548 | 97 | 0 | 527 | 1410 | 24 | 50 | 0 | 51 | 0 | 1 | 1029 | 94 | 45 |
PUNCT | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 100 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 23524 | 0 | 41 | 0 | 0 |
SCONJ | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 74 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 0 | 0 | 3704 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
SYM | 0 | 0 | 1 | 2 | 8 | 0 | 20 | 0 | 0 | 103 | 16 | 0 | 1 | 50 | 0 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 74 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5 | 0 | 33 | 5 | 0 | 0 | 2 | 5 | 0 | 52 | 35 | 16 | 0 | 0 | 7 | 57 | 1 | 90 | 0 | 1 |
VERB | 19 | 1420 | 1731 | 3260 | 8 | 663 | 67 | 29 | 39 | 166 | 2 | 0 | 1776 | 47 | 1 | 3037 | 0 | 261 | 4 | 1 | 0 | 0 | 9 | 0 | 0 | 8 | 1 | 0 | 0 | 0 | 12 | 8 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 19 | 10 | 2 | 0 | 4 | 892 | 0 | 5 | 7324 | 0 | 2243 |
X | 0 | 0 | 0 | 0 | 48 | 6 | 49 | 0 | 0 | 5 | 2 | 0 | 0 | 49 | 0 | 43 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 12 | 257 | 0 | 57 | 0 | 7 | 0 | 0 | 1 | 1 | 0 | 114 | 3 | 15 | 2 | 0 | 0 | 7 | 3 | 0 | 165 | 0 | 0 |
def searcher(df, column, query, inverse=False):
    """Filter the rows of ``df`` by a regex search on one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Name of a string column to search.
    query : str
        Regular expression, matched anywhere in the value
        (``Series.str.contains`` semantics).
    inverse : bool, default False
        If true, return the rows that do NOT match instead.

    Returns
    -------
    pandas.DataFrame
        The matching (or, with ``inverse``, non-matching) rows of ``df``.
    """
    # na=False: a missing value counts as "no match". Without it the mask can
    # contain NaN, which can neither be negated nor used for boolean indexing.
    bool_ix = df[column].str.contains(query, na=False)
    return df[~bool_ix] if inverse else df[bool_ix]


# Expose the helper as a DataFrame method so searches can be chained
pd.DataFrame.search = searcher
# Get nominal subjects starting with a, b or c. Both queries are regular
# expressions: 'nsubj' also matches 'nsubj:pass', and '^[abc]' anchors the
# character class to the start of the value in the 'w' column
df.search('f', 'nsubj').search('w', '^[abc]').head().to_html()
w | l | x | p | g | f | e | type | gender | Case | Definite | Degree | Foreign | Gender | Mood | Number | Person | Poss | Reflex | Tense | Voice | Type | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
s | i | ||||||||||||||||||||||
3 | 4.0 | authorities | authority | NOUN | NNS | 5 | nsubj | _ | _ | _ | _ | _ | _ | _ | _ | _ | Plur | _ | _ | _ | _ | _ | _ |
8 | 2.0 | cells | cell | NOUN | NNS | 4 | nsubj | _ | _ | _ | _ | _ | _ | _ | _ | _ | Plur | _ | _ | _ | _ | _ | _ |
9 | 3.0 | announcement | announcement | NOUN | NN | 6 | nsubj:pass | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ |
12 | 3.0 | commander | commander | NOUN | NN | 7 | nsubj | _ | _ | _ | _ | _ | _ | _ | _ | _ | Sing | _ | _ | _ | _ | _ | _ |
9.0 | bombings | bombing | NOUN | NNS | 11 | nsubj | _ | _ | _ | _ | _ | _ | _ | _ | _ | Plur | _ | _ | _ | _ | _ | _ |
def _conclines(match, df=False, column=False):
    """Add concordance context to one matching token row.

    Meant to be passed to ``DataFrame.apply(..., axis=1)``, so it is called
    once per matching row (token), not once per sentence.

    Parameters
    ----------
    match : pandas.Series
        One row of ``df``; its ``name`` is unpacked as the ``(s, i)`` index
        pair, so a two-level index is assumed.
    df : pandas.DataFrame
        The full frame the row was taken from. Its ``'w'`` column supplies
        the other tokens of sentence ``s``.
    column : str
        The column that was searched. Unless it is ``'w'``, its value is
        appended to the word as ``word/value``.

    Returns
    -------
    pandas.Series
        A copy of ``match`` with three extra entries: ``'left'`` (words
        before the token), ``'right'`` (words after it) and ``'match'``.
    """
    s, i = match.name
    # every word of the sentence this token belongs to, indexed by ``i``
    sent = df['w'].loc[s]
    # copy first: pandas does not support mutating the row passed in by apply()
    match = match.copy()
    # label-based slices are inclusive, hence i - 1 / i + 1 to skip the token
    match['left'] = sent.loc[:i - 1].str.cat(sep=' ')
    match['right'] = sent.loc[i + 1:].str.cat(sep=' ')
    formatted = match['w']
    if column != 'w':
        # format() rather than ``+`` so a non-string value cannot raise
        formatted = '{}/{}'.format(formatted, match[column])
    match['match'] = formatted
    return match


def conc(df, column, query):
    """Build a simple concordancer.

    Parameters
    ----------
    df : pandas.DataFrame
        Token frame with a two-level ``(s, i)`` index and a ``'w'`` column.
    column : str
        Name of the string column to search.
    query : str
        Regular expression, matched anywhere in the value.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``'left'``, ``'match'`` and
        ``'right'``; empty (but with those columns) when nothing matches.
    """
    context_columns = ['left', 'match', 'right']
    # get query matches; na=False treats missing values as non-matches so the
    # mask is strictly boolean
    matches = df[df[column].str.contains(query, na=False)]
    if matches.empty:
        # apply() on an empty frame never creates the context columns, so the
        # column selection below would raise KeyError
        return matches.reindex(columns=context_columns)
    # add left and right columns
    lines = matches.apply(_conclines, df=df, column=column, axis=1)
    return lines[context_columns]


# Expose the concordancer as a DataFrame method
pd.DataFrame.conc = conc
# Concordance over the first 1000 rows for 'l' values matching 'be'. The query
# is an unanchored regex, so values that merely contain "be" (e.g. "member",
# "bomber") match as well
lines = df.head(1000).conc('l', 'be')
# Render the first 10 concordance lines as an HTML table
lines.head(10).to_html()
left | match | right | ||
---|---|---|---|---|
s | i | |||
2 | 9.0 | [ This killing of a respected cleric will | be/be | causing us trouble for years to come . ] |
4 | 4.0 | Two of them | were/be | being run by 2 officials of the Ministry of th... |
5.0 | Two of them were | being/be | run by 2 officials of the Ministry of the Inte... | |
5 | 5.0 | The MoI in Iraq | is/be | equivalent to the US FBI , so this would be li... |
15.0 | The MoI in Iraq is equivalent to the US FBI , ... | be/be | like having J. Edgar Hoover unwittingly employ... | |
27.0 | The MoI in Iraq is equivalent to the US FBI , ... | members/member | of the Weathermen bombers back in the 1960s . | |
31.0 | The MoI in Iraq is equivalent to the US FBI , ... | bombers/bomber | back in the 1960s . | |
6 | 3.0 | The third | was/be | being run by the head of an investment firm . |
4.0 | The third was | being/be | run by the head of an investment firm . | |
7 | 5.0 | You wonder if he | was/be | manipulating the market with his bombing targe... |