stats: add option to output in machine-friendly tabular format
shenwei356 committed Sep 16, 2017
1 parent 32a6d7a commit a6aae0f
Showing 3 changed files with 74 additions and 4 deletions.
2 changes: 2 additions & 0 deletions dev-version.md
@@ -1,3 +1,5 @@
 - `seqkit convert`: fix bug of read quality containing only 3 or fewer values.
   [shenwei356/bio/issues/3](https://github.com/shenwei356/bio/issues/3)
+- `seqkit stats`: add option `-T/--tabular` to output in machine-friendly tabular format
+  [#23](https://github.com/shenwei356/seqkit/issues/23)
 - fix typo [#22](https://github.com/shenwei356/seqkit/issues/22)
26 changes: 23 additions & 3 deletions doc/docs/usage.md
@@ -573,18 +573,19 @@ Examples
 Usage
 
 ```
-simple statistics of FASTA files
+simple statistics of FASTA/Q files
 Usage:
   seqkit stats [flags]
 Aliases:
   stats, stat
 Flags:
-  -a, --all                  all statistics, including sum_gap, N50, L50
+  -a, --all                  all statistics, including sum_gap, N50
   -G, --gap-letters string   gap letters (default "- .")
   -h, --help                 help for stats
+  -T, --tabular              output in machine-friendly tabular format
 ```

@@ -599,6 +600,25 @@ Examples
     reads_1.fq.gz  FASTQ  DNA  2,500  567,516  226  227  229
     reads_2.fq.gz  FASTQ  DNA  2,500  560,002  223  224  225
 
+1. Machine-friendly tabular format
+
+        $ seqkit stats *.f{a,q}.gz -T
+        file               format  type  num_seqs  sum_len  min_len  avg_len  max_len
+        hairpin.fa.gz      FASTA   RNA   28645     2949871  39       103.0    2354
+        mature.fa.gz       FASTA   RNA   35828     781222   15       21.8     34
+        Illimina1.8.fq.gz  FASTQ   DNA   10000     1500000  150      150.0    150
+        reads_1.fq.gz      FASTQ   DNA   2500      567516   226      227.0    229
+        reads_2.fq.gz      FASTQ   DNA   2500      560002   223      224.0    225
+
+        $ seqkit stats *.f{a,q}.gz -T | csvtk pretty -t
+        file               format  type  num_seqs  sum_len  min_len  avg_len  max_len
+        hairpin.fa.gz      FASTA   RNA   28645     2949871  39       103.0    2354
+        mature.fa.gz       FASTA   RNA   35828     781222   15       21.8     34
+        Illimina1.8.fq.gz  FASTQ   DNA   10000     1500000  150      150.0    150
+        reads_1.fq.gz      FASTQ   DNA   2500      567516   226      227.0    229
+        reads_2.fq.gz      FASTQ   DNA   2500      560002   223      224.0    225
+
 1. Extra information
 
     $ seqkit stats *.f{a,q}.gz -a
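The `-T` output shown above is plain TSV, so any standard TSV reader can consume it. A minimal standalone sketch (not part of this commit; it assumes `seqkit` is on `PATH` and `reads_1.fq.gz` exists) that parses it in Go with `encoding/csv`:

```go
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os/exec"
	"strings"
)

func main() {
	// Capture the machine-friendly output of `seqkit stats -T`.
	out, err := exec.Command("seqkit", "stats", "-T", "reads_1.fq.gz").Output()
	if err != nil {
		log.Fatal(err)
	}

	// encoding/csv parses TSV once Comma is set to a tab.
	r := csv.NewReader(strings.NewReader(string(out)))
	r.Comma = '\t'
	rows, err := r.ReadAll()
	if err != nil {
		log.Fatal(err)
	}

	// rows[0] is the header (file, format, type, num_seqs, ...);
	// the remaining rows hold one record per input file.
	header, records := rows[0], rows[1:]
	for _, rec := range records {
		for i, v := range rec {
			fmt.Printf("%s=%s ", header[i], v)
		}
		fmt.Println()
	}
}
```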
50 changes: 49 additions & 1 deletion seqkit/cmd/stat.go
Expand Up @@ -25,6 +25,7 @@ import (
"io"
"runtime"
"sort"
"strings"

"github.com/cznic/sortutil"
"github.com/dustin/go-humanize"
@@ -66,6 +67,7 @@ var statCmd = &cobra.Command{
 		gapLettersBytes := []byte(gapLetters)
 
 		all := getFlagBool(cmd, "all")
+		tabular := getFlagBool(cmd, "tabular")
 
 		files := getFileList(args)

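`getFlagBool` is a seqkit helper defined elsewhere in the package; presumably it is a thin wrapper over cobra's `GetBool` that aborts on a bad flag name. A sketch under that assumption (illustrative only, not the package's actual code):

```go
package cmd

import (
	"log"

	"github.com/spf13/cobra"
)

// getFlagBool, sketched: fetch a registered bool flag and
// fail loudly if the flag name is wrong.
func getFlagBool(cmd *cobra.Command, flag string) bool {
	value, err := cmd.Flags().GetBool(flag)
	if err != nil {
		log.Fatal(err)
	}
	return value
}
```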
@@ -154,6 +156,51 @@ var statCmd = &cobra.Command{
 			}
 		}
 
+		// tabular output
+		if tabular {
+			colnames := []string{
+				"file",
+				"format",
+				"type",
+				"num_seqs",
+				"sum_len",
+				"min_len",
+				"avg_len",
+				"max_len",
+			}
+			if all {
+				colnames = append(colnames, []string{"sum_gap", "N50"}...)
+			}
+			outfh.WriteString(strings.Join(colnames, "\t") + "\n")
+
+			for _, info := range statInfos {
+				if !all {
+					outfh.WriteString(fmt.Sprintf("%s\t%s\t%s\t%d\t%d\t%d\t%.1f\t%d\n",
+						info.file,
+						info.format,
+						info.t,
+						info.num,
+						info.lenSum,
+						info.lenMin,
+						info.lenAvg,
+						info.lenMax))
+				} else {
+					outfh.WriteString(fmt.Sprintf("%s\t%s\t%s\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n",
+						info.file,
+						info.format,
+						info.t,
+						info.num,
+						info.lenSum,
+						info.lenMin,
+						info.lenAvg,
+						info.lenMax,
+						info.gapSum,
+						info.N50))
+				}
+			}
+			return
+		}
+
 		// format output
 		columns := []prettytable.Column{
 			{Header: "file"},
@@ -226,6 +273,7 @@
 func init() {
 	RootCmd.AddCommand(statCmd)
 
+	statCmd.Flags().BoolP("tabular", "T", false, "output in machine-friendly tabular format")
 	statCmd.Flags().StringP("gap-letters", "G", "- .", "gap letters")
-	statCmd.Flags().BoolP("all", "a", false, "all statistics, including sum_gap, N50, L50")
+	statCmd.Flags().BoolP("all", "a", false, "all statistics, including sum_gap, N50")
 }
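In the tabular branch above, the header is joined with tabs via `strings.Join` and every row is written with a format string whose verbs line up one-to-one with the column list; with `-a`, two extra columns (`sum_gap`, `N50`) and two extra `%d` verbs are appended. A standalone sketch of that pairing, using sample values from the documentation above:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Header: column names joined by tabs, as in the commit.
	colnames := []string{"file", "format", "type", "num_seqs",
		"sum_len", "min_len", "avg_len", "max_len"}
	fmt.Println(strings.Join(colnames, "\t"))

	// One format verb per column: three strings, then integer lengths,
	// with avg_len printed to one decimal place. Values are sample data
	// from the usage examples above.
	fmt.Printf("%s\t%s\t%s\t%d\t%d\t%d\t%.1f\t%d\n",
		"reads_1.fq.gz", "FASTQ", "DNA", 2500, 567516, 226, 227.0, 229)
}
```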
