plantinformatics · rsuchecki · Jan 8, 2020 · Dec 10, 2019 · Dec 10, 2019 · Dec 11, 2019
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -0,0 +1,68 @@
+name: CI
+
+on: [push]
+
+jobs:
+  docker:
+    runs-on: ubuntu-18.04
+    steps:
+      - name: Install GraphViz
+        run: |
+          sudo apt-get update && sudo apt-get install -y graphviz
+      - name: Install Nextflow
+        run: |
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+      - name: Check out code
+        uses: actions/checkout@v1    
+      - name: Test workflow (docker) 
+        run: |
+          NXF_VER=19.10.0 nextflow run ${GITHUB_WORKSPACE} -profile CI,docker --max_cpus 2 --max_memory 4.GB -ansi-log false
+
+  singularity:
+    runs-on: ubuntu-18.04
+    # runs-on: ubuntu-18.04
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v1        
+      - name: Set up Go
+        uses: actions/setup-go@v1
+        with:
+          go-version: 1.13
+        id: go
+      - name: Install Dependencies for Singularity 
+        run: |
+          sudo apt-get update && sudo apt-get install -y \
+            build-essential \
+            libssl-dev \
+            uuid-dev \
+            libgpgme11-dev \
+            squashfs-tools \
+            libseccomp-dev \
+            pkg-config
+      - name: Install Singularity
+        env:
+          SINGULARITY_VERSION: 3.5.2
+        run: |
+          export GOPATH=/tmp/go
+          mkdir -p $GOPATH
+          sudo mkdir -p /usr/local/var/singularity/mnt && \
+          mkdir -p $GOPATH/src/github.com/sylabs && \
+          cd $GOPATH/src/github.com/sylabs && \
+          wget -qO- https://github.com/sylabs/singularity/releases/download/v${SINGULARITY_VERSION}/singularity-${SINGULARITY_VERSION}.tar.gz | \
+          tar xzv && \
+          cd singularity && \
+          ./mconfig -p /usr/local && \
+          make -C builddir && \
+          sudo make -C builddir install
+      - name: Install Nextflow
+        run: |
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+      - name: Pull containers 
+        run: |
+          echo "GITHUB_WORKSPACE: ${GITHUB_WORKSPACE}"
+          nextflow run ${GITHUB_WORKSPACE}/pull_containers.nf -ansi-log false -profile singularity
+      - name: Test workflow  (singularity)
+        run: |
+          NXF_VER=19.10.0 nextflow run ${GITHUB_WORKSPACE} -profile CI,singularity --max_cpus 2 --max_memory 4.GB -ansi-log false
diff --git a/README.md b/README.md
@@ -2,31 +2,29 @@
 
 [![GitHub commits since latest release](https://img.shields.io/github/commits-since/plantinformatics/pretzel-input-generator/latest.svg?style=for-the-badge&logo=github)](https://github.com/plantinformatics/pretzel-input-generator/releases)
 
-Note that this description lacks detail which will be added in future releases.
-
-# Pipeline overview
-
-`pretzel-input-generator` is a [nextflow](https://www.nextflow.io) pipeline for generating input for [pretzel](https://github.com/plantinformatics/pretzel) from annotated and (mostly) contiguous genome assemblies. The pipeline requires approximately 1 cpu-day, but as many processes can run independently, the real run-time is much shorter if suitable compute resources are available.
+![GitHub Workflow Status](https://img.shields.io/github/workflow/status/plantinformatics/pretzel-input-generator/CI?label=CI%20TESTS&logo=github&style=for-the-badge)
 
 
-<!-- TOC -->
-
+**Note that this README is partly out-of-date **
 - [Pipeline overview](#pipeline-overview)
 - [Default pipeline](#default-pipeline)
-  - [Quick start example using yeast data](#quick-start-example-using-yeast-data)
+  - [Quick start example using microsporidia data](#quick-start-example-using-microsporidia-data)
   - [Input specification (triticeae and other relevant data sets)](#input-specification-triticeae-and-other-relevant-data-sets)
-    - [Data sources](#data-sources)
-      - [Remote](#remote)
-      - [Local](#local)
-    - [Other considerations](#other-considerations)
+    - [Disparate triticeae datasets](#disparate-triticeae-datasets)
   - [Dependencies](#dependencies)
   - [Execution](#execution)
   - [Output](#output)
 - [BUSCO-based pipeline](#busco-based-pipeline)
   - [Quick-ish start](#quick-ish-start)
   - [Output](#output-1)
 
-<!-- /TOC -->
+
+# Pipeline overview
+
+`pretzel-input-generator` is a [nextflow](https://www.nextflow.io) pipeline for generating input for [pretzel](https://github.com/plantinformatics/pretzel) from annotated and (mostly) contiguous genome assemblies. The pipeline requires approximately ??? cpu-???, but as many processes can run independently, the real run-time is much shorter if suitable compute resources are available.
+
+
+<!-- TODO: re-generate TOC -->
 
 # Default pipeline
 
@@ -35,54 +33,34 @@ Designed for EnsemblPlants and similarly formatted data.
 ![doc/dag.png](doc/dag.png)
 
 
-## Quick start example using yeast data
+## Quick start example using microsporidia data
 
 
-Requires [nextflow](https://www.nextflow.io) and [Singularity](http://singularity.lbl.gov)
+Requires [nextflow](https://www.nextflow.io) and either [Singularity](http://singularity.lbl.gov)
 
 ```
 nextflow run plantinformatics/pretzel-input-generator \
--profile YEAST,singularity --max_cpus 2 --max_memory 2.GB 
+-profile MICROSPORIDIA,singularity --max_cpus 2 --max_memory 2.GB 
 ```
 
-This will pull and process data sets from [Ensembl](https://ensembl.org) specified in [`conf/ensembl-yeast.config`](conf/ensembl-yeast.config)
+This will pull and process data sets specified in [`conf/microsporidia.config`](conf/microsporidia.config)
 
-## Input specification (triticeae and other relevant data sets)
-
-Input files are specified in [conf/triticeae.config](conf/triticeae.config). This can be supplemented/replaced by JSON/YAML formatted input spec.
-
-### Data sources
 
-Currently all input data comes from the following sources:
-
-* [Ensembl plants](https://plants.ensembl.org) - multiple datasets as specified in [`conf/triticeae.config`](conf/triticeae.config) and
-* [International Wheat Genome Sequencing Consortium](https://www.wheatgenome.org/)
-  * [Triticum aestivum (Chinese Spring) IWGSC RefSeq v1.0 assembly](https://wheat-urgi.versailles.inra.fr/Seq-Repository/Assemblies)
-* [The wild emmer wheat sequencing consortium (WEWseq)](http://wewseq.wixsite.com/consortium)
-  * Zavitan assembly downloaded from [GrainGenes](https://wheat.pw.usda.gov/GG3/wildemmer)
-* [European Nucleotide Archive](https://www.ebi.ac.uk/ena)
-  * [Assembly of chromosome 2D of *Triticum aestivum* line CH Campala *Lr22a*](https://www.ebi.ac.uk/ena/data/view/LS480641)
-  * [Assembly of *Triticum urartu* ](https://www.ebi.ac.uk/ena/data/view/GCA_003073215)
-    * Annotation downloaded from [MBKBase](http://www.mbkbase.org/Tu/)
-  * [Assembly of *Aegilops tauschii* ](https://www.ebi.ac.uk/ena/data/view/GCA_002575655.1)
-    * Annotation downloaded from [http://aegilops.wheat.ucdavis.edu/ATGSP/annotation/](http://aegilops.wheat.ucdavis.edu/ATGSP/annotation/)
-* ...and more...
-*
-#### Remote
+## Input specification (triticeae and other relevant data sets)
 
-The pipeline pulls data from Ensembl, included species and assembly versions are specified in configuration file(s) e.g. [conf/ensembl-plants-data.config](conf/ensembl-plants-data.config).
-For each of the data sets the pipeline downloads:
+A mix of local and remote files can be specified - see [`conf/microsporidia.config`](conf/microsporidia.config) and the corresponding [`conf/test-data.config`](conf/test-data.config)
 
+There are several paths through the pipeline which are executed depending on input specification and availability of various input file types, e.g. 
 
-* genome assembly index file (required)
+* genome assembly index file 
 * protein sequences (required if pipeline is to generate aliases)
-* genome assembly fasta (only required if pipeline is to place markers on assemblies)
+* marker sequences
+* genome assembly fasta (required if pipeline is to place marker sequences on assemblies)
 
-#### Local
+Different paths through the pipeline rely on partly different inputs
 
-Different branches of the pipeline rely on partly different inputs
+1. Generation of genome blocks requires a genome assembly index file - all we really need are lengths of pseudo-chromosomes so a two-column `.tsv` file with chromosome names and their lengths will suffice. Also, if genome assembly fasta file is specified, the index will be generated automatically.
 
-1. Generation of genome blocks requires a genome assembly index file - all we really need are lengths of pseudo-chromosomes so a two-column `.tsv` file with chromosome names and their lengths will suffice
 2. Placement of gene features on the generated genome blocks and generation of aliases between features requires
 
   * gene annotations (either GTF or GFF3)
@@ -102,9 +80,10 @@ This follows how protein sequences are annotated on Ensembl plants, but we do no
 
 4. Marker placement requires full reference FASTA file.
 
-### Other considerations
 
-Wherever possible the local assembly files are used as input for the pipeline in their original form - as downloaded from their respective sources. This is however not always possible due to inconsistencies in formatting and varying levels of adherence to standards and conventions. We try to capture additional steps needed to prepare these input data sets for the inclusion in this pipeline in [doc/format_local.md](doc/format_local.md).
+### Disparate triticeae datasets 
+
+Wherever possible the assembly files are used as input for the pipeline in their original form - as downloaded from their respective sources. This is however not always possible due to inconsistencies in formatting and varying levels of adherence to standards and conventions. We try to capture additional steps needed to prepare these input data sets for the inclusion in this pipeline in [doc/format_local.md](doc/format_local.md).
 
 ## Dependencies
 
@@ -114,14 +93,13 @@ Wherever possible the local assembly files are used as input for the pipeline in
   * [Docker](http://singularity.lbl.gov)
   * Required software installed. In addition to standard linux tools, these include:
     * [FASTX-Toolkit](http://hannonlab.cshl.edu/fastx_toolkit/)
-    * [MMSeqs2](https://github.com/soedinglab/mmseqs2)
+    * [MMSeqs2](https://github.com/soedinglab/mmseqs2) - if generating aliases
     * [Minimap2](https://github.com/lh3/minimap2) - if placing markers
     * `jq`
     * `groovy` interpreter
 
-
 When using Singularity or Docker, the required containers are specified in [`conf/containers.conf`](conf/containers.config)
-
+and pulled by Nextflow as required.
 
 ## Execution
 
@@ -131,21 +109,21 @@ Run locally with docker
 
 ```
 nextflow run plantinformatics/pretzel-input-generator \
--profile YEAST,docker 
+-profile MICROSPORIDIA,docker 
 ```
 
 Run locally with singularity
 
 ```
 nextflow run plantinformatics/pretzel-input-generator \
--profile YEAST,singularity 
+-profile MICROSPORIDIA,singularity 
 ```
 
 Dispatch on a SLURM cluster with singularity
 
 ```
 nextflow run plantinformatics/pretzel-input-generator \
--profile YEAST,slurm,singularity
+-profile MICROSPORIDIA,slurm,singularity
 ```
 
 ## Output
@@ -162,13 +140,13 @@ All generated JSON files generated by the pipeline are output to `results/JSON`.
 
 The output files (hopefully) conform to the requirements of [pretzel data structure](https://github.com/plantinformatics/pretzel-data).
 
-
 The `results/flowinfo` directory contains summaries of pipeline execution and `results/downloads` includes the files downloaded from Ensembl plants.
 
 ```
 results
 ├── downloads
 ├── flowinfo
+├── summary
 └── JSON
 ```
 
@@ -193,4 +171,4 @@ This will pull and process data sets from [DNA Zoo](https://www.dnazoo.org/) spe
 
 ## Output
 
-In comparison with the main pipeline the output lacks `*_aliases.json.gz` as features on different genomes are implicitly connected by BUSCOs identifiers.
+In comparison with the main pipeline the output lacks `*_aliases.json.gz` as features on different genomes are implicitly connected by BUSCOs identifiers.
diff --git a/bin/excludeSameChromosome.awk b/bin/excludeSameChromosome.awk
@@ -1,9 +1,9 @@
-#!/usr/bin/awk -f
+#!/usr/bin/gawk -f
 
 BEGIN {
   OFS="\t";
 }
-NR==FNR && $3 ~/^chromosome/ {
+NR==FNR && $3 ~/^(chromosome|supercontig)/ {
   #gsub("^>","",$1)
   split($3,location,":");
   idmap[$1]=location[3];

diff --git a/bin/filterForRepresentative.awk b/bin/filterForRepresentative.awk
@@ -1,4 +1,4 @@
-#!/usr/bin/awk -f
+#!/usr/bin/gawk -f
 
 BEGIN {
   FS = "\t";
@@ -9,8 +9,8 @@ BEGIN {
   split($1,arr," "); #GET GENE FIELD
   split(arr[4],gene,":"); #GET GENE FIELD
   ID=gene[2]; #GET GENE ID
-  sub(/^>[^ ]+/, ">"ID); #USE GENE ID AS FASTA IDENTIFIER (NOT TRANSCRIPT ID)
-  if(!(ID in storedIDs) || length($2) > length(StoredSeqLines[ID])) { #FIRST OCCURANCE OT LONGER THAN STORED
+  sub(/^>[^ ]+/, ">"ID); #USE GENE ID AS FASTA IDENTIFIER (RATER THAN THE TRANSCRIPT ID)
+  if(!(ID in storedIDs) || length($2) > length(StoredSeqLines[ID])) { #FIRST OCCURANCE OR LONGER THAN STORED
     storedIdLInes[ID] = $1;
     # print "storing "$1
     storedSeqLines[ID] = $2;

diff --git a/bin/gff3AndRepr2ensembl_pep.awk b/bin/gff3AndRepr2ensembl_pep.awk
@@ -1,4 +1,4 @@
-#!/usr/bin/awk -f
+#!/usr/bin/gawk -f
 
 BEGIN {
   FS="\t";
@@ -15,24 +15,41 @@ NR==FNR {
 }
 
 NR!=FNR {
-  if($3 =="mRNA") {
+  if($3 =="CDS") {
     gsub("\"","");
     split($9,arr,";| ");
     for(i in arr) {
       split(arr[i], pair, "=");
       if(pair[1]=="ID") {
         transcript=pair[2];
       } else if(pair[1]=="Parent") {
-        gene=pair[2];
+        parent=pair[2];
+        gene=parent
         gsub(/\.[0-9]+$/,"",gene);
+      } else if(pair[1] ~ /^protein(_source)?_id$/)  {
+        source=pair[2];
       }
     }
-    if(transcript in repr && !(gene in printed)) {
+    # print "p="parent,"g="gene,"t="transcript,"s="source
+    # if(transcript in repr && !(gene in printed)) { 
+    if(!(parent in printed)) {
       #IGNORECASE=1;
       gsub(/chr_?/,"",$1);
       #IGNORECASE=0;
-      printed[gene]=1;
-      print ">"transcript" pep chromosome:"version":"$1":"$4":"$5" gene:"gene"\n"repr[transcript];
+
+      if(source in repr) {
+        id = source
+      } else if(parent in repr) { #dealing with GFF files being inconsistent...
+        id = parent
+      } else {
+        id = ""
+      }
+    # if(source in repr && !(gene in printed)) {
+      # print ">"transcript" pep chromosome:"version":"$1":"$4":"$5" gene:"gene"\n"repr[transcript];
+      if(id) {
+        print ">"id" pep chromosome:"version":"$1":"$4":"$5" gene:"gene"\n"repr[id];       
+        printed[parent]=1;       
+      }
     }
   }
 }
diff --git a/bin/gtfAndRepr2ensembl_pep.awk b/bin/gtfAndRepr2ensembl_pep.awk
@@ -1,4 +1,4 @@
-#!/usr/bin/awk -f
+#!/usr/bin/gawk -f
 
 BEGIN {
   FS="\t";

diff --git a/bin/paf2pretzel.groovy b/bin/paf2pretzel.groovy
@@ -5,7 +5,9 @@ import java.util.zip.GZIPInputStream
 import java.util.zip.GZIPOutputStream
 
 
-@Grab('info.picocli:picocli:4.0.0-alpha-3') //command line interface
+//@Grab('info.picocli:picocli-groovy:4.1.2') //command line interface
+groovy.grape.Grape.grab(group:'info.picocli', module:'picocli-groovy', version:'4.1.2')
+
 @Command(header = [
        //Font Name: Calvin S
        $/@|bold,blue  ╔═╗╔═╗╔═╗  ┌┬┐┌─┐  ╔═╗┬─┐┌─┐┌┬┐┌─┐┌─┐┬   |@/$,
@@ -52,6 +54,9 @@ import static picocli.CommandLine.*
 @Option(names = ["--align-params"], description = ["Params used to generate input PAF alignments"])
 @Field private String alignParams
 
+@Option(names = ["--allowed-target-id-pattern"], description = ["Provide target identifier patter if other than common chromosome naming"])
+@Field private String allowedTargetIdPattern
+
 @Option(names = ["-O", "--output"], description = ["JSON output file name"])
 @Field private String output = '/dev/stdout'
 
@@ -112,8 +117,12 @@ pafContent.eachLine { line ->
     // println "${query_identity} >= ${minIdentity} ?"
     if(query_identity >= minIdentity) {
       def kosher = true;
-      if(!(TNAME.toLowerCase() ==~ /^(chr(omosome)?)?(_)?([0-9]+|x|y|i|v).*/)) {
+      // println "check if TNAME kosher"
+      // if(!(TNAME.toLowerCase() ==~ /^(ch(romosome)?)?(_)?([0-9]+|x|y|i|v).*/)) {
+      // if(!(TNAME.toLowerCase() ==~ /^(ch(romosome)?)?(_)?([0-9]+|x|y|i|v|[0-9a-z_\-]).*/)) {        
+      if(!((TNAME.toLowerCase() =~ /^(ch|[0-9]{1,2}|x|y|i|v)/) || (TNAME =~ allowedTargetIdPattern) )) {
         kosher = false //don't report placement on plasmid or other non-pseudomolecule parts of assembly
+        // println "${allowedTargetIdPattern} not matching $TNAME"
       } else if(markerMode && query_identity < 1) { //Not a 100% match, so for markers we check if no MM in last 3 bases - if notMarkerMode the required tag may not be present
         TAGS.each { tag ->
           if(tag.startsWith('cs:Z')) {
@@ -129,6 +138,7 @@ pafContent.eachLine { line ->
       }
 
       if(kosher) {
+        // println TNAME
         def key = TNAME.replaceFirst("^(C|c)(H|h)(R|r)[_]?","")
         if(!scope.containsKey(key)) {
           scope << [(key) : []]