xin-huang · xin-huang · Jul 8, 2022 · Dec 17, 2021
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -24,7 +24,7 @@ jobs:
         environment-file: conda-env.yml
     - name: Test with pytest
       run: |
-        micromamba run -n dadi-cli pytest --cov=. --cov-report term-missing
+        micromamba run -n dadi-cli pytest --cov=. --cov-report term-missing -vv
         micromamba run -n dadi-cli coverage xml
     - name: upload coverage report to codecov
       uses: codecov/codecov-action@v3

diff --git a/.gitignore b/.gitignore
@@ -4,7 +4,7 @@ dist/*
 examples/snakepipe/*/.snakemake/*
 examples/snakepipe/*/1KG.*
 */.DS_Store
-*/__pycache__/*
+*__pycache__*
 .coverage
-tests/test_results/*.opts.*
+tests/test_results/
 wq-factory-*
diff --git a/.travis.yml b/.travis.yml
@@ -23,6 +23,7 @@ install:
     - python setup.py install
     - cd ../dadi-cli
     - python setup.py install
+    - conda install -y -c conda-forge nlopt
 script: pytest -vv --cov=. --cov-report term-missing
 after_success:
     - codecov
diff --git a/README.md b/README.md
diff --git a/conda-env.yml b/conda-env.yml
@@ -1,17 +1,22 @@
 name: dadi-cli
 channels:
-        - conda-forge
-        - bioconda
+  - conda-forge
+  - bioconda
 dependencies:
-        - dill==0.3.4
-        - codecov
-        - matplotlib
-        - ndcctools==7.4.3
-        - numpy
-        - pip
-        - pytest-cov
-        - python==3.8
-        - scipy
-        - snakemake==7.1.1
-        - pip:
-            - .
+  - codecov
+  - gcc
+  - gfortran
+  - demes
+  - dill==0.3.4
+  - matplotlib
+  - ndcctools==7.4.3
+  - nlopt
+  - numpy
+  - pip
+  - pytest-cov
+  - python==3.8
+  - scipy
+  - snakemake==7.1.1
+  - pip:
+      - git+https://bitbucket.org/gutenkunstlab/dadi@devel
+      - .
diff --git a/dadi_cli/BestFit.py b/dadi_cli/BestFit.py
@@ -0,0 +1,70 @@
+import glob, sys
+import numpy as np
+from dadi_cli.Models import get_model
+from dadi_cli.Pdfs import get_dadi_pdf_params
+
+def get_bestfit_params(path, lbounds, ubounds, output, delta, Nclose=3, Nbest=100):
+    """
+    Collect optimization results from several files and report the best fits.
+
+    Every file matching `path` is read line by line. Lines starting with '#'
+    are kept as comments, except the one starting with '# Log(likelihood)',
+    which is remembered as the column header. Every other non-empty line is
+    parsed as whitespace-separated floats whose first value is the
+    log-likelihood; lines that cannot be parsed are ignored.
+
+    Parameters
+    ----------
+    path : str
+        Glob pattern selecting the result files.
+    lbounds, ubounds : sequence
+        Lower and upper bounds handed to close2boundaries together with the
+        best row (first and last columns dropped).
+    output : str
+        File that receives the comments, the converged results (if any) and
+        the top `Nbest` results.
+    delta : float
+        A row counts as "close enough" when 1 - opt_ll / ll <= delta, where
+        opt_ll is the highest log-likelihood found.
+    Nclose : int
+        Minimum number of close-enough rows required to declare convergence.
+    Nbest : int
+        Number of best rows written to the "Top" section.
+
+    Returns
+    -------
+    numpy.ndarray or None
+        The close-enough rows when convergence is reached, otherwise None
+        (also None when no numerical result could be parsed).
+
+    Raises
+    ------
+    ValueError
+        If no file matches `path`.
+    """
+    files = glob.glob(path)
+    if not files:
+        raise ValueError('No files or incorrect path naming (--input-prefix path name should end with InferDM).')
+    res, comments = [], []
+    # The header line may be absent from the inputs; keep a sentinel so the
+    # output can still be written instead of failing on an unbound name.
+    params = None
+
+    for f in files:
+        with open(f, 'r') as fid:
+            for line in fid:
+                line = line.rstrip()
+                if line.startswith('#'):
+                    if line.startswith('# Log(likelihood)'):
+                        params = line
+                    else:
+                        comments.append(line)
+                    continue
+                # Parse numerical result
+                try:
+                    values = [float(_) for _ in line.split()]
+                except ValueError:
+                    # Ignore lines with a parsing error
+                    continue
+                # A blank line parses to an empty row, which would make the
+                # result array ragged; skip it.
+                if values:
+                    res.append(values)
+
+    if len(res) == 0:
+        print('No optimization results found')
+        return
+
+    res = np.array(sorted(res, reverse=True))
+    opt_ll = res[0][0]
+    # Filter out those results within delta threshold
+    close_enough = res[1 - (opt_ll / res[:,0]) <= delta]
+    converged = len(close_enough) >= Nclose
+
+    with open(output, 'w') as fid:
+        # Output command line
+        fid.write('# {0}\n'.format(' '.join(sys.argv)))
+        # Output all comment lines found
+        fid.write('\n'.join(comments) + '\n')
+
+        if converged:
+            print("Converged")
+            if close2boundaries(close_enough[0][1:-1], lbounds, ubounds):
+                print("WARNING: The converged parameters are close to the boundaries")
+            # Spacer
+            fid.write('#\n# Converged results\n')
+            if params is not None:
+                fid.write(params+'\n')
+            for result in close_enough:
+                fid.write('{0}\n'.format("\t".join([str(_) for _ in result])))
+        else:
+            print("No convergence")
+
+        fid.write('#\n# Top {0} results\n'.format(Nbest))
+        if params is not None:
+            fid.write(params+'\n')
+        for result in res[:Nbest]:
+            fid.write('{0}\n'.format("\t".join([str(_) for _ in result])))
+
+    if converged:
+        return close_enough
+
+def close2boundaries(params, lbounds, ubounds):
+    """
+    Tell whether any parameter sits within 1% of its allowed range from a bound.
+
+    Parameters whose lower or upper bound is None are not checked.
+
+    Returns True as soon as one parameter is closer than 1% of
+    (upper - lower) to either bound, and False otherwise.
+    """
+    for i, value in enumerate(params):
+        upper = ubounds[i]
+        if upper is None:
+            continue
+        lower = lbounds[i]
+        if lower is None:
+            continue
+        span = upper - lower
+        # Distance to each bound, expressed as a fraction of the range.
+        if (value - lower)/span < 0.01:
+            return True
+        if (upper - value)/span < 0.01:
+            return True
+    return False
diff --git a/src/GenerateCache.py → dadi_cli/GenerateCache.py b/src/GenerateCache.py → dadi_cli/GenerateCache.py
@@ -2,52 +2,53 @@
 import dadi.DFE as DFE
 import pickle, glob
 import numpy as np
-from src.Models import get_dadi_model_func
+from dadi_cli.Models import get_model
 
-def generate_cache(model, grids, popt, misid,
+def generate_cache(func, grids, popt,
                    gamma_bounds, gamma_pts, additional_gammas,
-                   output, sample_sizes, mp, cuda, single_gamma):
+                   output, sample_sizes, mp, cuda, dimensionality):
+    """
+    Build a DFE spectra cache and pickle it to `output`.
+
+    `popt` is handed to _get_opt and rebound to whatever it returns; that
+    value is then passed as the first argument of the cache constructor.
+    `func`, `sample_sizes`, `grids` (as `pts`), `additional_gammas`,
+    `gamma_bounds`, `gamma_pts` and `mp` are forwarded unchanged to
+    DFE.Cache1D (dimensionality == 1) or DFE.Cache2D (dimensionality == 2).
+    When `cuda` is truthy, dadi.cuda_enabled(True) is called first.
+    When `grids` is None, three grid sizes are derived from sample_sizes[0].
+
+    A warning is printed if the resulting spectra contain negative entries.
+
+    Raises ValueError for any other value of `dimensionality`.
+    """
 
-    popt = _get_opt(popt, misid)
+    popt = _get_opt(popt)
 
     if cuda:
         dadi.cuda_enabled(True)
 
-    func = get_dadi_model_func(model, True, single_gamma)
+    # Default grid sizes: first sample size plus 10, 20 and 30.
     if grids == None:
         grids = [sample_sizes[0]+10, sample_sizes[0]+20, sample_sizes[0]+30]
 
-    #print(grids)
-
-    if single_gamma:
-       spectra = DFE.Cache1D(popt, sample_sizes, func, pts_l=grids, additional_gammas=additional_gammas, gamma_bounds=gamma_bounds, gamma_pts=gamma_pts, mp=mp) 
-    elif (model == 'equil') or (model == 'two_epoch') or (model == 'three_epoch'):
-       spectra = DFE.Cache1D(popt, sample_sizes, func, pts_l=grids, additional_gammas=additional_gammas, gamma_bounds=gamma_bounds, gamma_pts=gamma_pts, mp=mp) 
-    else:
+    if dimensionality == 1:
+       spectra = DFE.Cache1D(popt, sample_sizes, func, pts=grids, additional_gammas=additional_gammas, gamma_bounds=gamma_bounds, gamma_pts=gamma_pts, mp=mp) 
+    elif dimensionality == 2:
        spectra = DFE.Cache2D(popt, sample_sizes, func, pts=grids, additional_gammas=additional_gammas, gamma_bounds=gamma_bounds, gamma_pts=gamma_pts, mp=mp)
+    else:
+        raise ValueError("Incorrect value for --dimensionality")
 
+    # Report negative cache entries, which the message ties to the --grids setting.
     if (spectra.spectra<0).sum() > 0:
         print(
-            '!!!WARNING!!!\nPotentially large negative values!\nMost negative value is: '+str(spectra.spectra.min())+
-            '\nIf negative values are very negative (<-1), rerun with larger values for --grids'
+            f'!!!WARNING!!!\nPotentially large negative values!\nMost negative value is: {spectra.spectra.min()}'+
+            f'\nSum of negative entries is: {np.sum(spectra.spectra[spectra.spectra<0])}\nIf negative values are very negative (<-1), rerun with larger values for --grids'
             )
 
+    # The cache object is serialized with pickle protocol 2.
     fid = open(output, 'wb')
     pickle.dump(spectra, fid, protocol=2)
     fid.close()
 
-def _get_opt(popt, misid):
+# This function is very similar to dadi_cli.utilities._get_opts_and_theta.
+# However, because we want to always remove misid for cache generation
+# We need a custom function.
+def _get_opt(popt):
 
     opts = []
     params = []
     fid = open(popt, 'r')
     for line in fid.readlines():
         if line.startswith('#'):
-            if line.startswith('# L'): params.append(line.rstrip().split("\t"))
+            if line.startswith('# L'): params.extend(line.rstrip().split("\t"))
             continue
         else:
             try:
-                opts.append([float(_) for _ in line.rstrip().split()])
+                opts.extend([float(_) for _ in line.rstrip().split()])
+                break
             except ValueError:
                 pass
     fid.close()
@@ -60,12 +61,12 @@ def _get_opt(popt, misid):
     # The first parameter in the optimization results is the likelihood
     # The last parameter in the optimization results is theta
     # The misidentification is the second last parameter if exists
-    if misid: 
-        popt = opts[0][1:-2]
-        params = params[0][1:-2]
+    if 'misid' in params: 
+        popt = opts[1:-2]
+        params = params[1:-2]
     else: 
-        popt = opts[0][1:-1]
-        params = params[0][1:-1]
+        popt = opts[1:-1]
+        params = params[1:-1]
 
     print('The optimal parameters are:')
     print("\t".join([str(_) for _ in params]))

diff --git a/dadi_cli/GenerateFs.py b/dadi_cli/GenerateFs.py
@@ -0,0 +1,101 @@
+import dadi
+
+def generate_fs(vcf, output, pop_ids, pop_info, projections, subsample, polarized, marginalize_pops, bootstrap, chunk_size, masking, seed):
+    """
+    Build frequency spectra from a VCF file and write them to disk.
+
+    The data dictionary is created with dadi.Misc.make_data_dict_vcf from
+    `vcf` and `pop_info`; when `subsample` is truthy, a {pop_id: projection}
+    mapping built from `pop_ids` and `projections` is passed as `subsample`.
+
+    When `bootstrap` is None a single spectrum is built with
+    dadi.Spectrum.from_data_dict and written to `output`. Otherwise
+    `bootstrap` replicates are produced with generate_bootstrap_fs (using
+    `chunk_size`) and written to '<output>.bootstrapping.<b>.fs'.
+
+    Each spectrum is masked with mask_entries unless `masking` is '', and
+    marginalized over `marginalize_pops` unless that is None.
+
+    `seed`, when not None, seeds the `random` module once before the
+    bootstrap loop, so a run is reproducible while its replicates differ
+    from each other.
+    """
+    if subsample:
+        subsample_dict = {pop: projections[i] for i, pop in enumerate(pop_ids)}
+        dd = dadi.Misc.make_data_dict_vcf(vcf_filename=vcf, popinfo_filename=pop_info, subsample=subsample_dict)
+    else:
+        dd = dadi.Misc.make_data_dict_vcf(vcf_filename=vcf, popinfo_filename=pop_info)
+
+    if bootstrap is None:
+        fs = dadi.Spectrum.from_data_dict(dd, pop_ids=pop_ids, projections=projections, polarized=polarized)
+        fs = _postprocess_fs(fs, masking, marginalize_pops, pop_ids)
+        fs.to_file(output)
+        return
+
+    # Seed once here rather than inside every generate_bootstrap_fs call:
+    # reseeding with the same value before each replicate would make every
+    # replicate draw the same chunks. generate_bootstrap_fs draws from the
+    # module-level `random` generator and leaves it alone when given None.
+    import random
+    if seed is not None:
+        random.seed(seed)
+    for b in range(bootstrap):
+        fs = generate_bootstrap_fs(dd, chunk_size, pop_ids, projections, polarized, None)
+        fs = _postprocess_fs(fs, masking, marginalize_pops, pop_ids)
+        fs.to_file(output + '.bootstrapping.' + str(b) + '.fs')
+
+def _postprocess_fs(fs, masking, marginalize_pops, pop_ids):
+    """Apply the optional masking and marginalization steps to one spectrum."""
+    if masking != '':
+        mask_entries(fs, masking)
+    if marginalize_pops is not None:
+        fs = marginalized_fs(fs, marginalize_pops, pop_ids)
+    return fs
+
+def generate_bootstrap_fs(dd, chunk_size, pop_ids, projections, polarized, seed):
+    """
+    Build one bootstrap spectrum by resampling genomic chunks with replacement.
+
+    Keys of `dd` are expected to look like '<chromosome>_<position>'. Each
+    chromosome is cut into consecutive windows of `chunk_size` positions
+    (window k holds positions k*chunk_size + 1 .. (k+1)*chunk_size, empty
+    windows included). For every chromosome as many windows as it contains
+    are drawn with replacement, and the entries of the drawn windows are
+    collected into a new data dictionary passed to
+    dadi.Spectrum.from_data_dict.
+
+    Parameters
+    ----------
+    dd : dict
+        Data dictionary keyed by '<chromosome>_<position>'.
+    chunk_size : int
+        Window length in positions; must be positive.
+    pop_ids, projections, polarized
+        Forwarded to dadi.Spectrum.from_data_dict.
+    seed
+        When not None, seeds the `random` module before sampling.
+
+    Raises
+    ------
+    ValueError
+        If `chunk_size` is not positive.
+    """
+    import random
+    if chunk_size <= 0:
+        raise ValueError('chunk_size must be a positive number')
+    if seed is not None:
+        random.seed(seed)
+
+    # Group the original keys by chromosome, then by window index. The window
+    # is computed directly from the position so that gaps wider than
+    # chunk_size cannot shift later positions into the wrong chunk.
+    windows = {}
+    for key in dd:
+        chrname, _, pos = key.rpartition("_")
+        pos = int(pos)
+        window = max(0, int((pos - 1) // chunk_size))
+        # Keep the original key so the lookup below never depends on
+        # re-formatting the position as text.
+        windows.setdefault(chrname, {}).setdefault(window, []).append((pos, key))
+
+    # sample the dictionary with replacement
+    bdd = {}
+    index = 0
+    for chrname, by_window in windows.items():
+        for entries in by_window.values():
+            entries.sort()
+        n_chunks = max(by_window) + 1
+        for chunk in random.choices(range(n_chunks), k=n_chunks):
+            # Windows without any position contribute nothing.
+            for _, key in by_window.get(chunk, ()):
+                bdd[index] = dd[key]
+                index += 1
+    fs = dadi.Spectrum.from_data_dict(bdd, pop_ids=pop_ids, projections=projections, polarized=polarized)
+    return fs
+
+def mask_entries(fs, masking):
+    """
+    Mask singleton-type entries of a 1-, 2- or 3-population spectrum in place.
+
+    The entries at index 1 (and its mirror -2) along each population axis are
+    always masked. When `masking` equals 'shared', the 2- and 3-population
+    cases additionally mask the entries listed in the "shared" tables below.
+    Spectra with any other number of populations are left untouched.
+    """
+    n_pops = len(fs.sample_sizes)
+
+    if n_pops == 1:
+        fs.mask[1] = True
+        fs.mask[-2] = True
+        return
+
+    if n_pops == 2:
+        private = [(1, 0), (-2, -1), (0, 1), (-1, -2)]
+        shared = [(1, 1), (-2, -2)]
+    elif n_pops == 3:
+        private = [
+            (1, 0, 0), (-2, -1, -1),
+            (0, 1, 0), (-1, -2, -1),
+            (0, 0, 1), (-1, -1, -2),
+        ]
+        # NOTE(review): (0, 1, 1) and (-1, -2, -2) are not in this list --
+        # confirm whether that is intended.
+        shared = [
+            (1, 1, 0), (1, 0, 1), (1, 1, 1),
+            (-2, -2, -1), (-2, -1, -2), (-2, -2, -2),
+        ]
+    else:
+        return
+
+    targets = private + shared if masking == 'shared' else private
+    for entry in targets:
+        fs.mask[entry] = True
+
+def marginalized_fs(fs, marginalize_pops, pop_ids):
+    """
+    Marginalize `fs` over the populations named in `marginalize_pops`.
+
+    Each name is translated to its position in `pop_ids`, and the resulting
+    list of positions is passed to fs.marginalize, whose result is returned.
+    """
+    axes = []
+    for pop in marginalize_pops:
+        axes.append(pop_ids.index(pop))
+    return fs.marginalize(axes)
+
+
+
+
+
+