forked from pbfrandsen/insect_genome_assemblies
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_genome_stats.py
51 lines (43 loc) · 1.79 KB
/
extract_genome_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import sys
import csv
# Zero-based positions of the input CSV columns this script reads.
ACCESSION_COL = 2
ASSEMBLY_LEVEL_COL = 4
CONTIG_N50_COL = 6
DISPLAY_NAME_COL = 7
EXTRA_INFO_COL = 9
LENGTH_COL = 10
# A row must reach at least the highest fixed index used above.
MIN_COLUMNS = LENGTH_COL + 1
# A contig N50 strictly greater than this value is tallied separately.
CONTIG_N50_THRESHOLD = 999999


def load_accessions(accession_filename):
    """Return the set of whitespace-stripped lines of *accession_filename*."""
    with open(accession_filename) as access_file:
        return {line.strip() for line in access_file}


def _species_name(extra_info):
    """Return the ``sci_name`` value found in *extra_info*, or ``""``.

    *extra_info* is split on commas; for every piece containing
    ``sci_name`` the text between the first and second ``": "`` is taken,
    and the last such piece wins.  A piece without ``": "`` is ignored
    rather than raising ``IndexError``.
    """
    species_name = ""
    for item in extra_info.split(","):
        if "sci_name" in item:
            parts = item.split(": ")
            if len(parts) > 1:
                species_name = parts[1]
    return species_name


def _exceeds(value, threshold):
    """Return True if *value* parses as an int greater than *threshold*.

    A blank or non-numeric value counts as "not greater" instead of
    aborting the whole run.
    """
    try:
        return int(value) > threshold
    except ValueError:
        return False


def extract_genome_stats(filename, outfilename, accession_filename):
    """Append one summary line per accession-matched row of a CSV file.

    The first row of *filename* is treated as a header and skipped.  For
    each remaining row whose accession column is listed in
    *accession_filename*, a comma-joined line is appended to
    *outfilename*::

        taxon,species,display_name,accession,contig_n50,assembly_level,length,date

    ``taxon`` is everything before the first ``"."`` in *filename* exactly
    as it was passed in (so any directory part is kept), and ``date`` is
    the stripped last column of the row.  Rows with fewer than
    ``MIN_COLUMNS`` columns (for example blank lines) are skipped.

    NOTE(review): the nesting of the original script was reconstructed
    from a copy that had lost its indentation; confirm that every
    accession-matched row (not only a subset) is meant to be written.

    Returns:
        A dict with the tallies gathered along the way:
        ``genome_number`` (matched rows), ``chrom_genome_number`` (matched
        rows whose assembly level is ``"Chromosome"``),
        ``contig_n50_greater`` (matched rows whose contig N50 is above
        ``CONTIG_N50_THRESHOLD``) and ``accessions`` (set of matched
        accessions).
    """
    taxon_name = filename.split(".")[0]
    revised_accessions = load_accessions(accession_filename)
    accessions = set()
    genome_number = 0
    chrom_genome_number = 0
    contig_n50_greater = 0

    # newline="" lets the csv module do its own line-ending handling.
    with open(filename, newline="") as infile, \
            open(outfilename, "a") as outfile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < MIN_COLUMNS:
                continue
            accession = row[ACCESSION_COL]
            if accession not in revised_accessions:
                continue

            assembly_level = row[ASSEMBLY_LEVEL_COL]
            contig_n50 = row[CONTIG_N50_COL]

            accessions.add(accession)
            genome_number += 1
            if assembly_level == "Chromosome":
                chrom_genome_number += 1
            if _exceeds(contig_n50, CONTIG_N50_THRESHOLD):
                contig_n50_greater += 1

            # Fields are joined verbatim (no CSV quoting) so the output
            # format stays exactly what existing consumers already read.
            fields = [
                taxon_name,
                _species_name(row[EXTRA_INFO_COL]),
                row[DISPLAY_NAME_COL],
                accession,
                contig_n50,
                assembly_level,
                row[LENGTH_COL],
                row[-1].strip(),
            ]
            outfile.write(",".join(fields) + "\n")

    return {
        "genome_number": genome_number,
        "chrom_genome_number": chrom_genome_number,
        "contig_n50_greater": contig_n50_greater,
        "accessions": accessions,
    }


def main(argv=None):
    """Run the extraction with ``<input.csv> <output.csv> <accessions.txt>``.

    *argv* defaults to ``sys.argv[1:]``; arguments after the third are
    ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.exit("usage: extract_genome_stats.py "
                 "<input.csv> <output.csv> <accessions.txt>")
    extract_genome_stats(args[0], args[1], args[2])


if __name__ == "__main__":
    main()