Merge pull request #263 from commonsense/memory-consumption
Memory consumption
Robyn Speer committed May 28, 2019
2 parents 705144c + 9785d52 commit 7fb141f
Showing 12 changed files with 499 additions and 240 deletions.
83 changes: 53 additions & 30 deletions Snakefile
@@ -56,8 +56,9 @@ EMOJI_LANGUAGES = [
# Increment this number when we incompatibly change the parser
WIKT_PARSER_VERSION = "2"

CONVERT_SHARDS = 6
RETROFIT_SHARDS = 6
PROPAGATE_SHARDS = 6
PROPAGATE_SHARDS = 10

# Dataset filenames
# =================
@@ -573,7 +574,7 @@ rule assoc_uniq:
rule reduce_assoc:
input:
DATA + "/assoc/assoc.csv",
expand(DATA + "/vectors/{name}.h5", name=INPUT_EMBEDDINGS)
expand(DATA + "/vectors/{name}-converted.h5", name=INPUT_EMBEDDINGS)
output:
DATA + "/assoc/reduced.csv"
shell:
@@ -587,91 +588,113 @@ rule convert_word2vec:
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/w2v-google-news.h5"
temp(expand(DATA + "/vectors/w2v-google-news-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_word2vec -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_glove:
input:
DATA + "/raw/vectors/glove12.840B.300d.txt.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/glove12-840B.h5"
temp(expand(DATA + "/vectors/glove12-840B-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_glove -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_fasttext_crawl:
input:
DATA + "/raw/vectors/crawl-300d-2M.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/crawl-300d-2M.h5"
temp(expand(DATA + "/vectors/crawl-300d-2M-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_fasttext:
input:
DATA + "/raw/vectors/fasttext-wiki-{lang}.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-wiki-{lang}.h5"
temp(expand(DATA + "/vectors/fasttext-wiki-{{lang}}-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} -l {wildcards.lang} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_lexvec:
input:
DATA + "/raw/vectors/lexvec.commoncrawl.300d.W+C.pos.vectors.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/lexvec-commoncrawl.h5"
temp(expand(DATA + "/vectors/lexvec-commoncrawl-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_opensubtitles_ft:
input:
DATA + "/raw/vectors/ft-opensubtitles.vec.gz",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/fasttext-opensubtitles.h5"
temp(expand(DATA + "/vectors/fasttext-opensubtitles-converted.h5.shard{n}",
n=range(CONVERT_SHARDS)))
resources:
ram=24
ram=15
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} {single_input} {output}")
output_prefix = output[0][:-len(".shard0")]
shell("CONCEPTNET_DATA=data cn5-vectors convert_fasttext -n {MULTILINGUAL_SOURCE_EMBEDDING_ROWS} --nshards {CONVERT_SHARDS} {single_input} {output_prefix}")

rule convert_polyglot:
input:
DATA + "/raw/vectors/polyglot-{language}.pkl",
DATA + "/db/wiktionary.db"
output:
DATA + "/vectors/polyglot-{language}.h5"
DATA + "/vectors/polyglot-{language}-converted.h5"
run:
single_input = input[0]
shell("CONCEPTNET_DATA=data cn5-vectors convert_polyglot -l {wildcards.language} {single_input} {output}")

rule join_convert:
input:
expand(DATA + "/vectors/{{name}}-converted.h5.shard{n}", n=range(CONVERT_SHARDS))
output:
DATA + "/vectors/{name}-converted.h5"
resources:
ram=15
shell:
"cn5-vectors join_shard_files -n {CONVERT_SHARDS} {output}"

rule retrofit:
input:
DATA + "/vectors/{name}.h5",
DATA + "/vectors/{name}-converted.h5",
DATA + "/assoc/reduced.csv"
output:
temp(expand(DATA + "/vectors/{{name}}-retrofit.h5.shard{n}", n=range(RETROFIT_SHARDS)))
resources:
ram=24
ram=15
shell:
"cn5-vectors retrofit -n {RETROFIT_SHARDS} {input} {DATA}/vectors/{wildcards.name}-retrofit.h5"

@@ -681,7 +704,7 @@ rule join_retrofit:
output:
DATA + "/vectors/{name}-retrofit.h5"
resources:
ram=24
ram=15
shell:
"cn5-vectors join_shard_files -n {RETROFIT_SHARDS} {output}"

@@ -692,7 +715,7 @@ rule merge_intersect:
DATA + "/vectors/numberbatch-retrofitted.h5",
DATA + "/vectors/intersection-projection.h5"
resources:
ram=24
ram=15
shell:
"cn5-vectors intersect {input} {output}"

@@ -703,7 +726,7 @@ rule propagate:
output:
temp(expand(DATA + "/vectors/numberbatch-biased.h5.shard{n}", n=range(PROPAGATE_SHARDS)))
resources:
ram=24
ram=15
shell:
"cn5-vectors propagate -n {PROPAGATE_SHARDS} {input} {DATA}/vectors/numberbatch-biased.h5"

@@ -713,7 +736,7 @@ rule join_propagate:
output:
DATA + "/vectors/numberbatch-biased.h5"
resources:
ram=24
ram=15
shell:
"cn5-vectors join_shard_files -n {PROPAGATE_SHARDS} --sort {output}"

@@ -723,14 +746,14 @@ rule debias:
output:
DATA + "/vectors/numberbatch.h5"
resources:
ram=30
ram=15
shell:
"cn5-vectors debias {input} {output}"

rule miniaturize:
input:
DATA + "/vectors/numberbatch-biased.h5",
DATA + "/vectors/w2v-google-news.h5"
DATA + "/vectors/w2v-google-news-converted.h5"
output:
DATA + "/vectors/mini.h5"
resources:
@@ -794,7 +817,7 @@ rule compare_embeddings:
input:
DATA + "/raw/vectors/GoogleNews-vectors-negative300.bin.gz",
DATA + "/raw/vectors/glove12.840B.300d.txt.gz",
DATA + "/vectors/glove12-840B.h5",
DATA + "/vectors/glove12-840B-converted.h5",
DATA + "/raw/vectors/fasttext-wiki-en.vec.gz",
DATA + "/vectors/numberbatch-biased.h5",
DATA + "/vectors/numberbatch.h5",
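The Snakefile changes above replace each single converted-vectors output with a set of temporary `.shard{n}` files, lower the declared `ram` estimates, and add a `join_convert` rule that reassembles the shards with `cn5-vectors join_shard_files`. As a rough illustration of the shard-and-join pattern (a minimal sketch assuming pandas HDF5 frames and made-up helper names, not the actual conceptnet5 code):

```python
import pandas as pd

def save_shards(frame, output_prefix, nshards=6):
    """Write `frame` as `nshards` column slices, one HDF5 file per slice."""
    ncols = frame.shape[1]
    step = -(-ncols // nshards)  # ceiling division, so every column lands in some shard
    for n in range(nshards):
        shard = frame.iloc[:, n * step:(n + 1) * step]
        shard.to_hdf(f"{output_prefix}.shard{n}", key="mat", mode="w")

def join_shards(output_prefix, output_filename, nshards=6):
    """Recombine the shard files into one frame and save it."""
    pieces = [pd.read_hdf(f"{output_prefix}.shard{n}", key="mat")
              for n in range(nshards)]
    pd.concat(pieces, axis=1).to_hdf(output_filename, key="mat", mode="w")
```

Splitting along columns is one plausible choice, since steps such as retrofitting update each dimension independently; the diff itself does not show whether the real shards split rows or columns.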
4 changes: 2 additions & 2 deletions conceptnet5/builders/reduce_assoc.py
@@ -181,8 +181,8 @@ def read_embedding_vocabularies(filenames):
"""
result = pd.Index([])
for filename in filenames:
vectors = load_hdf(filename)
result = result.union(vectors.index)
vectors_index = load_hdf(filename, index_only=True)
result = result.union(vectors_index)
return result


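The `reduce_assoc` change swaps a full `load_hdf` call for `load_hdf(filename, index_only=True)`, so building the union of embedding vocabularies no longer requires holding any full embedding matrix in memory. A hedged sketch of how an index-only read could work, assuming the frames were written by pandas in its fixed HDF5 format (the `"mat"` key and `"axis1"` dataset name are assumptions about that layout, not something this diff specifies):

```python
import h5py
import pandas as pd

def load_index_only(filename, key="mat"):
    """Read only the row labels of a pandas-written, fixed-format HDF5 frame."""
    # pandas' fixed format stores each axis of the frame as its own dataset,
    # so the row labels can be read without loading the much larger value block.
    # Treating "axis1" as the index dataset is an assumption about that layout.
    with h5py.File(filename, "r") as f:
        labels = f[key]["axis1"][:]
    return pd.Index([label.decode("utf-8") for label in labels])
```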
37 changes: 22 additions & 15 deletions conceptnet5/vectors/cli.py
@@ -85,35 +85,45 @@ def run_retrofit(
@click.argument('glove_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nrows', '-n', default=500000)
def run_convert_glove(glove_filename, output_filename, nrows=500000):
convert_glove(glove_filename, output_filename, nrows)
@click.option('--nshards', default=6)
def run_convert_glove(glove_filename, output_filename, nrows=500000, nshards=6):
convert_glove(glove_filename, output_filename, nrows, nshards=nshards)


@cli.command(name='convert_fasttext')
@click.argument('fasttext_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nrows', '-n', default=500000)
@click.option('--language', '-l', default='en')
@click.option('--nshards', default=6)
def run_convert_fasttext(
fasttext_filename, output_filename, nrows=500000, language='en'
fasttext_filename, output_filename, nrows=500000, language='en', nshards=6
):
convert_fasttext(fasttext_filename, output_filename, nrows=nrows, language=language)
convert_fasttext(
fasttext_filename,
output_filename,
nrows=nrows,
language=language,
nshards=nshards,
)


@cli.command(name='convert_word2vec')
@click.argument('word2vec_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--nrows', '-n', default=500000)
def run_convert_word2vec(word2vec_filename, output_filename, nrows=500000):
convert_word2vec(word2vec_filename, output_filename, nrows)
@click.option('--nshards', default=6)
def run_convert_word2vec(word2vec_filename, output_filename, nrows=500000, nshards=6):
convert_word2vec(word2vec_filename, output_filename, nrows, nshards=nshards)


@cli.command(name='convert_polyglot')
@click.argument('polyglot_filename', type=click.Path(readable=True, dir_okay=False))
@click.argument('output_filename', type=click.Path(writable=True, dir_okay=False))
@click.option('--language', '-l')
def run_convert_polyglot(polyglot_filename, output_filename, language):
convert_polyglot(polyglot_filename, output_filename, language)
@click.option('--nshards', default=6)
def run_convert_polyglot(polyglot_filename, output_filename, language, nshards=6):
convert_polyglot(polyglot_filename, output_filename, language, nshards=nshards)


@cli.command(name='intersect')
@@ -128,7 +138,7 @@ def run_intersect(input_filenames, output_filename, projection_filename):
"""
intersected, projection = merge_intersect(input_filenames)
save_hdf(intersected, output_filename)
save_hdf(projection, projection_filename)
save_hdf(projection, projection_filename, format='mat')


@cli.command(name='debias')
@@ -246,7 +256,7 @@ def run_compare_embeddings(input_filenames, output_filename, run_analogies):
input_filenames, subset='all', run_analogies=run_analogies
)
print(results)
save_hdf(results, output_filename)
save_hdf(results, output_filename, format='mat')


@cli.command(name='comparison_graph')
@@ -280,11 +290,8 @@ def run_miniaturize(input_filename, extra_vocab_filename, output_filename, k):
"""
Save a smaller version of a frame, which includes frequent terms and phrases.
"""
frame = load_hdf(input_filename)
other_frame = load_hdf(extra_vocab_filename)
other_vocab = list(other_frame.index)
del other_frame
mini = miniaturize(frame, other_vocab=other_vocab, k=k)
other_vocab = list(load_hdf(extra_vocab_filename, index_only=True))
mini = miniaturize(input_filename, other_vocab=other_vocab, k=k)
save_hdf(mini, output_filename)

