-
Notifications
You must be signed in to change notification settings - Fork 0
/
subsample.py
executable file
·73 lines (58 loc) · 2.62 KB
/
subsample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#! /usr/bin/env python
import collections
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp_sparse
import tables
import sys
genome = "mm10"
try:
print(sys.argv)
filtered_matrix_h5 = sys.argv[1]
filename = sys.argv[2]
subsample_bcs = int(sys.argv[3])
if len(sys.argv) == 5:
genome = sys.argv[4]
except:
print("Wrong Arguments")
np.random.seed(0)
GeneBCMatrix = collections.namedtuple('GeneBCMatrix', ['gene_ids', 'gene_names', 'barcodes', 'matrix'])
def get_matrix_from_h5(filename, genome):
with tables.open_file(filename, 'r') as f:
try:
dsets = {}
for node in f.walk_nodes('/' + genome, 'Array'):
dsets[node.name] = node.read()
matrix = sp_sparse.csc_matrix((dsets['data'], dsets['indices'], dsets['indptr']), shape=dsets['shape'])
return GeneBCMatrix(dsets['genes'], dsets['gene_names'], dsets['barcodes'], matrix)
except tables.NoSuchNodeError:
raise Exception("Genome %s does not exist in this file." % genome)
except KeyError:
raise Exception("File is missing one or more required datasets.")
def save_matrix_to_h5(gbm, filename, genome):
flt = tables.Filters(complevel=1)
with tables.open_file(filename, 'w', filters=flt) as f:
try:
group = f.create_group(f.root, genome)
f.create_carray(group, 'genes', obj=gbm.gene_ids)
f.create_carray(group, 'gene_names', obj=gbm.gene_names)
f.create_carray(group, 'barcodes', obj=gbm.barcodes)
f.create_carray(group, 'data', obj=gbm.matrix.data)
f.create_carray(group, 'indices', obj=gbm.matrix.indices)
f.create_carray(group, 'indptr', obj=gbm.matrix.indptr)
f.create_carray(group, 'shape', obj=gbm.matrix.shape)
except:
raise Exception("Failed to write H5 file.")
def subsample_matrix(gbm, barcode_indices):
return GeneBCMatrix(gbm.gene_ids, gbm.gene_names, gbm.barcodes[barcode_indices], gbm.matrix[:, barcode_indices])
def get_expression(gbm, gene_name):
gene_indices = np.where(gbm.gene_names == gene_name)[0]
if len(gene_indices) == 0:
raise Exception("%s was not found in list of gene names." % gene_name)
return gbm.matrix[gene_indices[0], :].toarray().squeeze()
gene_bc_matrix = get_matrix_from_h5(filtered_matrix_h5, genome)
subset = np.sort(np.random.choice(gene_bc_matrix.barcodes.size, size=subsample_bcs, replace=False))
subsampled_matrix = subsample_matrix(gene_bc_matrix, subset)
save_matrix_to_h5(subsampled_matrix, filename, genome)