Merge pull request #263 from commonsense/memory-consumption
Memory consumption
Robyn Speer committed May 28, 2019
2 parents 705144c + 9785d52 commit 7fb141f
Showing 12 changed files with 499 additions and 240 deletions.
83 changes: 53 additions & 30 deletions Snakefile
@@ -56,8 +56,9 @@ EMOJI_LANGUAGES = [
# Increment this number when we incompatibly change the parser
WIKT_PARSER_VERSION = "2"

CONVERT_SHARDS = 6
RETROFIT_SHARDS = 6
PROPAGATE_SHARDS = 6
PROPAGATE_SHARDS = 10

# Dataset filenames
# =================
@@ -573,7 +574,7 @@ rule assoc_uniq:
rule reduce_assoc:
input:
DATA + "/assoc/assoc.csv",
expand(DATA + "/vectors/{name}.h5", name=INPUT_EMBEDDINGS)
expand(DATA + "/vectors/{name}-converted.h5", name=INPUT_EMBEDDINGS)
output:
DATA + "/assoc/reduced.csv"
shell:
@@ -587,91 +588,113 @@ rule convert_word2vec:
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/w2v-google-news.h5"
temp(expand(DATA + "/vectors/w2v-google-news-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_glove:
input:
DATA + "/raw/vectors/glove12.840B.300d.txt.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/glove12-840B.h5"
temp(expand(DATA + "/vectors/glove12-840B-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_fasttext_crawl:
input:
DATA + "/raw/vectors/crawl-300d-2M.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/crawl-300d-2M.h5"
temp(expand(DATA + "/vectors/crawl-300d-2M-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_fasttext:
input:
DATA + "/raw/vectors/fasttext-wiki-{lang}.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-wiki-{lang}.h5"
temp(expand(DATA + "/vectors/fasttext-wiki-{{lang}}-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_lexvec:
input:
DATA + "/raw/vectors/lexvec.commoncrawl.300d.W+C.pos.vectors.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/lexvec-commoncrawl.h5"
temp(expand(DATA + "/vectors/lexvec-commoncrawl-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_opensubtitles_ft:
input:
DATA + "/raw/vectors/ft-opensubtitles.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-opensubtitles.h5"
temp(expand(DATA + "/vectors/fasttext-opensubtitles-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_polyglot:
input:
DATA + "/raw/vectors/polyglot-{language}.pkl",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/polyglot-{language}.h5"
DATA + "/vectors/polyglot-{language}-converted.h5"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_polyglot -l {wildcards.language} {single_input} {output}")

rule join_convert:
input:
expand(DATA + "/vectors/{{name}}-converted.h5.shard{n}", n=range(CONVERT_SHARDS))
output:
DATA + "/vectors/{name}-converted.h5"
resources:
ram=15
shell:
"cn5-vectors join_shard_files -n {CONVERT_SHARDS} {output}"

rule retrofit:
input:
DATA + "/vectors/{name}.h5",
DATA + "/vectors/{name}-converted.h5",
DATA + "/assoc/reduced.csv"
output:
temp(expand(DATA + "/vectors/{{name}}-retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS)))
resources:
ram=24
ram=15
shell:
"cn5-vectors retrofit -n {RETROFIT_SHARDS} {input} {DATA}/vectors/{wildcards.name}-retrofit.h5"

@@ -681,7 +704,7 @@ rule join_retrofit:
output:
DATA + "/vectors/{name}-retrofit.h5"
resources:
ram=24
ram=15
shell:
"cn5-vectors join_shard_files -n {RETROFIT_SHARDS} {output}"

@@ -692,7 +715,7 @@ rule merge_intersect:
DATA + "/vectors/numberbatch-retrofitted.h5",
DATA + "/vectors/intersection-projection.h5"
resources:
ram=24
ram=15
shell:
"cn5-vectors intersect {input} {output}"

@@ -703,7 +726,7 @@ rule propagate:
output:
temp(expand(DATA + "/vectors/numberbatch-biased.h5.shard{n}", n=range(PROPAGATE_SHARDS)))
resources:
ram=24
ram=15
shell:
"cn5-vectors propagate -n {PROPAGATE_SHARDS} {input} {DATA}/vectors/numberbatch-biased.h5"

@@ -713,7 +736,7 @@ rule join_propagate:
output:
DATA + "/vectors/numberbatch-biased.h5"
resources:
ram=24
ram=15
shell:
"cn5-vectors join_shard_files -n {PROPAGATE_SHARDS} --sort {output}"

@@ -723,14 +746,14 @@ rule debias:
output:
DATA + "/vectors/numberbatch.h5"
resources:
ram=30
ram=15
shell:
"cn5-vectors debias {input} {output}"

rule miniaturize:
input:
DATA + "/vectors/numberbatch-biased.h5",
DATA + "/vectors/w2v-google-news.h5"
DATA + "/vectors/w2v-google-news-converted.h5"
output:
DATA + "/vectors/mini.h5"
resources:
@@ -794,7 +817,7 @@ rule compare_embeddings:
input:
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz",
DATA + "/raw/vectors/glove12.840B.300d.txt.gz",
DATA + "/vectors/glove12-840B.h5",
DATA + "/vectors/glove12-840B-converted.h5",
DATA + "/raw/vectors/fasttext-wiki-en.vec.gz",
DATA + "/vectors/numberbatch-biased.h5",
DATA + "/vectors/numberbatch.h5",
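The Snakefile changes above replace each single converted-vectors output with a set of temporary `.shard{n}` files, lower the declared `ram` estimates, and add a `join_convert` rule that reassembles the shards with `cn5-vectors join_shard_files`. As a rough illustration of the shard-and-join pattern (a minimal sketch assuming pandas HDF5 frames and made-up helper names, not the actual conceptnet5 code):

```python
import pandas as pd

def save_shards(frame, output_prefix, nshards=6):
    """Write `frame` as `nshards` column slices, one HDF5 file per slice."""
    ncols = frame.shape[1]
    step = -(-ncols // nshards)  # ceiling division, so every column lands in some shard
    for n in range(nshards):
        shard = frame.iloc[:, n * step:(n + 1) * step]
        shard.to_hdf(f"{output_prefix}.shard{n}", key="mat", mode="w")

def join_shards(output_prefix, output_filename, nshards=6):
    """Recombine the shard files into one frame and save it."""
    pieces = [pd.read_hdf(f"{output_prefix}.shard{n}", key="mat")
              for n in range(nshards)]
    pd.concat(pieces, axis=1).to_hdf(output_filename, key="mat", mode="w")
```

Splitting along columns is one plausible choice, since steps such as retrofitting update each dimension independently; the diff itself does not show whether the real shards split rows or columns.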
4 changes: 2 additions & 2 deletions conceptnet5/builders/reduce_assoc.py
@@ -181,8 +181,8 @@ def read_embedding_vocabularies(filenames):
"""
result = pd.Index([])
for filename in filenames:
vectors = load_hdf(filename)
result = result.union(vectors.index)
vectors_index = load_hdf(filename, index_only=True)
result = result.union(vectors_index)
return result


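The `reduce_assoc` change swaps a full `load_hdf` call for `load_hdf(filename, index_only=True)`, so building the union of embedding vocabularies no longer requires holding any full embedding matrix in memory. A hedged sketch of how an index-only read could work, assuming the frames were written by pandas in its fixed HDF5 format (the `"mat"` key and `"axis1"` dataset name are assumptions about that layout, not something this diff specifies):

```python
import h5py
import pandas as pd

def load_index_only(filename, key="mat"):
    """Read only the row labels of a pandas-written, fixed-format HDF5 frame."""
    # pandas' fixed format stores each axis of the frame as its own dataset,
    # so the row labels can be read without loading the much larger value block.
    # Treating "axis1" as the index dataset is an assumption about that layout.
    with h5py.File(filename, "r") as f:
        labels = f[key]["axis1"][:]
    return pd.Index([label.decode("utf-8") for label in labels])
```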
37 changes: 22 additions & 15 deletions conceptnet5/vectors/cli.py
@@ -85,35 +85,45 @@ def run_retrofit(
@click.argument('glove_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nrows', '-n', default=500000)
def run_convert_glove(glove_filename, output_filename, nrows=500000):
convert_glove(glove_filename, output_filename, nrows)
@click.option('--nshards', default=6)
def run_convert_glove(glove_filename, output_filename, nrows=500000, nshards=6):
convert_glove(glove_filename, output_filename, nrows, nshards=nshards)


@cli.command(name='convert_fasttext')
@click.argument('fasttext_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nrows', '-n', default=500000)
@click.option('--language', '-l', default='en')
@click.option('--nshards', default=6)
def run_convert_fasttext(
fasttext_filename, output_filename, nrows=500000, language='en'
fasttext_filename, output_filename, nrows=500000, language='en', nshards=6
):
convert_fasttext(fasttext_filename, output_filename, nrows=nrows, language=language)
convert_fasttext(
fasttext_filename,
output_filename,
nrows=nrows,
language=language,
nshards=nshards,
)


@cli.command(name='convert_word2vec')
@click.argument('word2vec_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nrows', '-n', default=500000)
def run_convert_word2vec(word2vec_filename, output_filename, nrows=500000):
convert_word2vec(word2vec_filename, output_filename, nrows)
@click.option('--nshards', default=6)
def run_convert_word2vec(word2vec_filename, output_filename, nrows=500000, nshards=6):
convert_word2vec(word2vec_filename, output_filename, nrows, nshards=nshards)


@cli.command(name='convert_polyglot')
@click.argument('polyglot_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--language', '-l')
def run_convert_polyglot(polyglot_filename, output_filename, language):
convert_polyglot(polyglot_filename, output_filename, language)
@click.option('--nshards', default=6)
def run_convert_polyglot(polyglot_filename, output_filename, language, nshards=6):
convert_polyglot(polyglot_filename, output_filename, language, nshards=nshards)


@cli.command(name='intersect')
@@ -128,7 +138,7 @@ def run_intersect(input_filenames, output_filename, projection_filename):
"""
intersected, projection = merge_intersect(input_filenames)
save_hdf(intersected, output_filename)
save_hdf(projection, projection_filename)
save_hdf(projection, projection_filename, format='mat')


@cli.command(name='debias')
@@ -246,7 +256,7 @@ def run_compare_embeddings(input_filenames, output_filename, run_analogies):
input_filenames, subset='all', run_analogies=run_analogies
)
print(results)
save_hdf(results, output_filename)
save_hdf(results, output_filename, format='mat')


@cli.command(name='comparison_graph')
@@ -280,11 +290,8 @@ def run_miniaturize(input_filename, extra_vocab_filename, output_filename, k):
"""
Save a smaller version of a frame, which includes frequent terms and phrases.
"""
frame = load_hdf(input_filename)
other_frame = load_hdf(extra_vocab_filename)
other_vocab = list(other_frame.index)
del other_frame
mini = miniaturize(frame, other_vocab=other_vocab, k=k)
other_vocab = list(load_hdf(extra_vocab_filename, index_only=True))
mini = miniaturize(input_filename, other_vocab=other_vocab, k=k)
save_hdf(mini, output_filename)

