Skip to content

Commit

Permalink
feat: support for zipped files
Browse files Browse the repository at this point in the history
  • Loading branch information
m0nhawk committed Feb 17, 2020
1 parent 7474892 commit 31fd6ce
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 24 deletions.
70 changes: 47 additions & 23 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,68 @@
import os
import time
from concurrent.futures import ProcessPoolExecutor
import zipfile


def _append_archive_members(archive_path, merged_path):
    """Append every file inside one zip archive to its merged output file."""
    # Function-scope import: the file's top-level import block is not fully
    # visible from here, so the new dependency is brought in locally.
    import shutil

    with zipfile.ZipFile(archive_path) as archive:
        for entry in archive.namelist():
            # Directory entries ("dir/") carry no data; opening them as an
            # output file would fail.
            if entry.endswith('/'):
                continue

            # The output file is named after the second path component of the
            # entry. Entries stored at the archive root have no second
            # component, so their own name is used instead of raising
            # IndexError.
            parts = entry.split('/')
            merged_file = parts[1] if len(parts) > 1 else parts[0]
            if merged_file in ('', '.', '..'):
                continue

            # Stream raw bytes: no decode/encode round trip (which depended on
            # the locale encoding) and no whole-file buffering in memory.
            target = os.path.join(merged_path, merged_file)
            with archive.open(entry) as source, open(target, "ab") as output:
                shutil.copyfileobj(source, output)


def merge(data_root='/data', headers_zip='headers.zip'):
    """Merge the zipped clinical files of the newest manifest into flat files.

    The newest manifest is the lexicographically last directory directly
    under ``data_root`` whose name is not ``data`` and does not contain
    ``_merged``. Header files are extracted from ``headers_zip`` into
    ``<manifest>_merged``; then every ``*.zip`` found in
    ``<manifest>/by-filepath/clinical/archive`` is opened and each member is
    appended to the output file named after the member's second path
    component. On success a ``<manifest>.sync`` marker file is written, and
    later calls return early while that marker exists.

    Nothing is written when no manifest exists, when the marker is already
    present, or when the archive directory is not there yet.

    Args:
        data_root: Directory that contains the manifest directories.
        headers_zip: Path of the zip archive holding the header files that
            seed the merged output directory.

    Returns:
        None. Progress and early-exit reasons are printed to stdout.
    """
    manifests = sorted(
        os.path.join(data_root, d)
        for d in os.listdir(data_root)
        if d != 'data'
        and "_merged" not in d
        and os.path.isdir(os.path.join(data_root, d))
    )
    if not manifests:
        print('no manifest is exported')
        return
    manifest = manifests[-1]

    path = os.path.join(manifest, 'by-filepath', 'clinical', 'archive')
    sync_marker = manifest + '.sync'

    if os.path.isfile(sync_marker):
        print("already converted")
        return
    print('getting latest manifest')
    print(path)

    # Check for the archive directory before touching the output directory,
    # so a not-yet-ready manifest leaves no partial "_merged" tree behind.
    if not os.path.isdir(path):
        print("mounting not finished yet")
        return

    merged_path = manifest + "_merged"

    # Seed the output directory with the header files; extractall overwrites
    # files of the same name left over from an earlier run.
    with zipfile.ZipFile(headers_zip) as z:
        z.extractall(path=merged_path)

    # Sorted so the order in which archives are appended is deterministic.
    for filename in sorted(glob.glob(os.path.join(path, '*.zip'))):
        _append_archive_members(filename, merged_path)

    with open(sync_marker, 'w') as f:
        f.write("finished")

Expand Down
Binary file added headers.zip
Binary file not shown.
2 changes: 1 addition & 1 deletion sidecarDockerrun.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ while true; do
gen3-fuse -config=/fuse-config.yaml -manifest=/manifest.json -mount-point=/data/$FILENAME -hostname=https://$HOSTNAME -wtsURL=http://workspace-token-service.$NAMESPACE >/proc/1/fd/1 2>/proc/1/fd/2
fi
else
OLDDIR=`df /data/manifest* | grep manifest | cut -d'/' -f 3 | head -n 1`
OLDDIR=`df /data/manifest* | grep manifest | cut -d'/' -f 7 | head -n 1`
if [ ! -z "$OLDDIR" ]; then
echo unmount old manifest $OLDDIR
fusermount -u /data/$OLDDIR; rm -rf /data/$OLDDIR
Expand Down

0 comments on commit 31fd6ce

Please sign in to comment.