This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Merge branch 'rrnrf' into 'master'
Recurrent Neural Network -- Random Fields

See merge request algorithm/scrappie!134
tmassingham-ont committed Dec 9, 2017
2 parents c286f86 + f60db0e commit a251397
Showing 13 changed files with 5,239 additions and 26 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -15,7 +15,7 @@ set (CPACK_PACKAGE_VENDOR "Oxford Nanopore Technologies")
set (CPACK_PACKAGE_DESCRIPTION_FILE "${PROJECT_SOURCE_DIR}/README.md")
set (CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE.md")
set (CPACK_PACKAGE_VERSION_MAJOR 1)
set (CPACK_PACKAGE_VERSION_MINOR 2)
set (CPACK_PACKAGE_VERSION_MINOR 3)
set (CPACK_PACKAGE_VERSION_PATCH 0)

# Get the latest abbreviated commit hash of the working branch
@@ -142,6 +142,7 @@ add_test(test_raw_call scrappie raw --model raw_r94 ${USE_THREADS} ${READSDIR})
add_test(test_rawrgr_call scrappie raw --model rgr_r94 ${USE_THREADS} ${READSDIR})
add_test(test_rawrgrgr_r94_call scrappie raw --model rgrgr_r94 ${USE_THREADS} ${READSDIR})
add_test(test_rawrgrgr_r95_call scrappie raw --model rgrgr_r95 ${USE_THREADS} ${READSDIR})
add_test(test_rawrnnrf_r94_call scrappie raw --model rnnrf_r94 ${USE_THREADS} ${READSDIR})
add_test(test_squiggle scrappie squiggle ${READSDIR}/test_squiggle.fa)
add_test(test_licence scrappie licence)
add_test(test_license scrappie license)
2 changes: 1 addition & 1 deletion README.md
@@ -111,7 +111,7 @@ Scrappie basecaller -- basecall from raw signal
    --local=penalty             Penalty for local basecalling
-m, --min_prob=probability      Minimum bound on probability of match
    --model=name                Raw model to use: "raw_r94", "rgr_r94",
                                "rgrgr_r94", "rgrgr_r95"
                                "rgrgr_r94", "rgrgr_r95", "rnnrf_r94"
-o, --output=filename           Write to file rather than stdout
-p, --prefix=string             Prefix to append to name of each read
-s, --skip=penalty              Penalty for skipping a base
3 changes: 3 additions & 0 deletions RELEASES.md
@@ -27,6 +27,9 @@ Scrappie's purpose is to demonstrate the next generation of base calling and, as

# Release history
The intention is that behaviour will be stable within a series, with only bug fixes or minor improvements being applied. An improvement or change in behaviour that is not a major shift in the algorithm will be a new series with a bump of the minor version number. Any major changes in the algorithm will be a new series with the major number bumped.
* 1.3 series: Recurrent Neural Network - Random Field models
* *release-1.3.0*
* Initial support for RNN-RF models for raw data calling (R9.4 only).
* 1.2 series: Sequence-to-squiggle
* *release-1.2.0*
* Ability to predict expected squiggle from sequence
128 changes: 128 additions & 0 deletions misc/parse_rnnrf.py
@@ -0,0 +1,128 @@
#!/usr/bin/env python3
import argparse
import pickle
import math
import numpy as np
import re
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--id', default='', help='Identifier for model names')
parser.add_argument('model', help='Pickle to read model from')


trim_trailing_zeros = re.compile('0+p')

def small_hex(f):
    hf = float(f).hex()
    return trim_trailing_zeros.sub('p', hf)


def process_column(v, pad):
    """ process and pad """
    return [small_hex(f) for f in v] + [small_hex(0.0)] * pad


def cformatM(fh, name, X, nr=None, nc=None):
    nrq = int(math.ceil(X.shape[1] / 4.0))
    pad = nrq * 4 - X.shape[1]
    lines = map(lambda v: ', '.join(process_column(v, pad)), X)

    if nr is None:
        nr = X.shape[1]
    else:
        nrq = int(math.ceil(nr / 4.0))
    if nc is None:
        nc = X.shape[0]

    fh.write('float {}[] = {}\n'.format('__' + name, '{'))
    fh.write('\t' + ',\n\t'.join(lines))
    fh.write('};\n')
    fh.write('_Mat {} = {}\n\t.nr = {},\n\t.nrq = {},\n\t.nc = {},\n\t.data.f = {}\n{};\n'.format('_' + name, '{', nr, nrq, nc, '__' + name, '}'))
    fh.write('const scrappie_matrix {} = &{};\n\n'.format(name, '_' + name))


def cformatV(fh, name, X):
    nrq = int(math.ceil(X.shape[0] / 4.0))
    pad = nrq * 4 - X.shape[0]
    lines = ', '.join(list(map(lambda f: small_hex(f), X)) + [small_hex(0.0)] * pad)
    fh.write('float {}[] = {}\n'.format('__' + name, '{'))
    fh.write('\t' + lines)
    fh.write('};\n')
    fh.write('_Mat {} = {}\n\t.nr = {},\n\t.nrq = {},\n\t.nc = {},\n\t.data.f = {}\n{};\n'.format('_' + name, '{', X.shape[0], nrq, 1, '__' + name, '}'))
    fh.write('const scrappie_matrix {} = &{};\n\n'.format(name, '_' + name))


if __name__ == '__main__':
    args = parser.parse_args()
    modelid = args.id + '_'

    with open(args.model, 'rb') as fh:
        network = pickle.load(fh, encoding='latin1')
    assert network.major_version == 2, "Sloika model must be major version 2 but model is {}.\nPerhaps you need to run Sloika's model_upgrade.py".format(network.major_version)

    sys.stdout.write("""#pragma once
#ifndef NANONET_RNNRF_{}MODEL_H
#define NANONET_RNNRF_{}MODEL_H
#include <assert.h>
#include "../util.h"
""".format(modelid.upper(), modelid.upper()))

    """ Convolution layer
    """
    filterW = network.sublayers[0].W.get_value()
    nfilter, _, winlen = filterW.shape
    cformatM(sys.stdout, 'conv_rnnrf_{}W'.format(modelid), filterW.reshape(-1, 1), nr=winlen * 4 - 3, nc=nfilter)
    cformatV(sys.stdout, 'conv_rnnrf_{}b'.format(modelid), network.sublayers[0].b.get_value().reshape(-1))
    sys.stdout.write("const int conv_rnnrf_{}stride = {};\n".format(modelid, network.sublayers[0].stride))
    sys.stdout.write("""const size_t _conv_rnnrf_{}nfilter = {};
const size_t _conv_rnnrf_{}winlen = {};
""".format(modelid, nfilter, modelid, winlen))

    """ Backward GRU (first layer)
    """
    gru1 = network.sublayers[1].sublayers[0].sublayers[0]
    cformatM(sys.stdout, 'gruB1_rnnrf_{}iW'.format(modelid), gru1.iW.get_value())
    cformatM(sys.stdout, 'gruB1_rnnrf_{}sW'.format(modelid), gru1.sW.get_value())
    cformatM(sys.stdout, 'gruB1_rnnrf_{}sW2'.format(modelid), gru1.sW2.get_value())
    cformatV(sys.stdout, 'gruB1_rnnrf_{}b'.format(modelid), gru1.b.get_value().reshape(-1))

    """ Forward GRU (second layer)
    """
    gru2 = network.sublayers[2].sublayers[0]
    cformatM(sys.stdout, 'gruF2_rnnrf_{}iW'.format(modelid), gru2.iW.get_value())
    cformatM(sys.stdout, 'gruF2_rnnrf_{}sW'.format(modelid), gru2.sW.get_value())
    cformatM(sys.stdout, 'gruF2_rnnrf_{}sW2'.format(modelid), gru2.sW2.get_value())
    cformatV(sys.stdout, 'gruF2_rnnrf_{}b'.format(modelid), gru2.b.get_value().reshape(-1))

    """ Backward GRU (third layer)
    """
    gru3 = network.sublayers[3].sublayers[0].sublayers[0]
    cformatM(sys.stdout, 'gruB3_rnnrf_{}iW'.format(modelid), gru3.iW.get_value())
    cformatM(sys.stdout, 'gruB3_rnnrf_{}sW'.format(modelid), gru3.sW.get_value())
    cformatM(sys.stdout, 'gruB3_rnnrf_{}sW2'.format(modelid), gru3.sW2.get_value())
    cformatV(sys.stdout, 'gruB3_rnnrf_{}b'.format(modelid), gru3.b.get_value().reshape(-1))

    """ Forward GRU (fourth layer)
    """
    gru4 = network.sublayers[4].sublayers[0]
    cformatM(sys.stdout, 'gruF4_rnnrf_{}iW'.format(modelid), gru4.iW.get_value())
    cformatM(sys.stdout, 'gruF4_rnnrf_{}sW'.format(modelid), gru4.sW.get_value())
    cformatM(sys.stdout, 'gruF4_rnnrf_{}sW2'.format(modelid), gru4.sW2.get_value())
    cformatV(sys.stdout, 'gruF4_rnnrf_{}b'.format(modelid), gru4.b.get_value().reshape(-1))

    """ Backward GRU (fifth layer)
    """
    gru5 = network.sublayers[5].sublayers[0].sublayers[0]
    cformatM(sys.stdout, 'gruB5_rnnrf_{}iW'.format(modelid), gru5.iW.get_value())
    cformatM(sys.stdout, 'gruB5_rnnrf_{}sW'.format(modelid), gru5.sW.get_value())
    cformatM(sys.stdout, 'gruB5_rnnrf_{}sW2'.format(modelid), gru5.sW2.get_value())
    cformatV(sys.stdout, 'gruB5_rnnrf_{}b'.format(modelid), gru5.b.get_value().reshape(-1))

    """ Global norm layer
    """
    nstate = network.sublayers[6].W.get_value().shape[0]
    cformatM(sys.stdout, 'FF_rnnrf_{}W'.format(modelid), network.sublayers[6].W.get_value())
    cformatV(sys.stdout, 'FF_rnnrf_{}b'.format(modelid), network.sublayers[6].b.get_value())

    sys.stdout.write('#endif /* NANONET_RNNRF_{}MODEL_H */'.format(modelid.upper()))
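
The generator above leans on two details worth noting: scrappie stores matrices as quads of four floats, so every column is zero-padded to nrq = ceil(nr / 4) quads, and weights are emitted as C99 hexadecimal float literals, which round-trip bit-exactly through the C compiler. A minimal self-contained sketch (not part of this commit; values are illustrative only):

import math
import re

trim_trailing_zeros = re.compile('0+p')

def small_hex(f):
    # float.hex() is an exact C99-style literal, e.g. 1.5 -> '0x1.8000000000000p+0'
    return trim_trailing_zeros.sub('p', float(f).hex())

nr = 6                          # rows in one column of a weight matrix
nrq = int(math.ceil(nr / 4.0))  # quads of four floats needed: 2
pad = nrq * 4 - nr              # zeros appended to fill the last quad: 2

column = [1.5, -2.0, 3.0, 0.25, 7.0, 1.0]
print(', '.join([small_hex(f) for f in column] + [small_hex(0.0)] * pad))
# 0x1.8p+0, -0x1.p+1, 0x1.8p+1, 0x1.p-2, 0x1.cp+2, 0x1.p+0, 0x0.p+0, 0x0.p+0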
80 changes: 80 additions & 0 deletions src/decode.c
@@ -829,3 +829,83 @@ float sloika_viterbi(const_scrappie_matrix logpost, float stay_pen, float skip_p

    return logscore;
}

float decode_crf(const_scrappie_matrix trans, int * path){
    RETURN_NULL_IF(NULL == trans, NAN);
    RETURN_NULL_IF(NULL == path, NAN);

    //  Each column of trans holds an nstate x nstate matrix of transition scores
    const size_t nstate = roundf(sqrtf((float)trans->nr));
    assert(nstate * nstate == trans->nr);
    float * mem = calloc(2 * nstate, sizeof(float));
    RETURN_NULL_IF(NULL == mem, NAN);

    float * curr = mem;
    float * prev = mem + nstate;

    const size_t nblk = trans->nc;

    scrappie_imatrix tb = make_scrappie_imatrix(nstate, nblk);
    if(NULL == tb){
        free(mem);
        return NAN;
    }

    //  Forwards Viterbi pass
    for(size_t blk=0 ; blk < nblk ; blk++){
        const size_t offset = blk * trans->nrq * 4;
        const size_t tboffset = blk * tb->nrq * 4;
        {   // Swap working buffers
            float * tmp = curr;
            curr = prev;
            prev = tmp;
        }

        for(size_t st1=0 ; st1 < nstate ; st1++){
            //  Best score of reaching st1 over all previous states st2,
            //  recording the argmax for traceback
            const size_t offsetS = offset + st1 * nstate;
            curr[st1] = trans->data.f[offsetS + 0] + prev[0];
            tb->data.f[tboffset + st1] = 0;
            for(size_t st2=1 ; st2 < nstate ; st2++){
                const float score = trans->data.f[offsetS + st2] + prev[st2];
                if(score > curr[st1]){
                    curr[st1] = score;
                    tb->data.f[tboffset + st1] = st2;
                }
            }
        }
    }

    //  Traceback -- path must have space for nblk + 1 entries
    const float score = valmaxf(curr, nstate);
    path[nblk] = argmaxf(curr, nstate);
    for(size_t blk=nblk ; blk > 0 ; blk--){
        const size_t offset = (blk - 1) * tb->nrq * 4;
        path[blk - 1] = tb->data.f[offset + path[blk]];
    }

    tb = free_scrappie_imatrix(tb);
    free(mem);

    return score;
}

char * crfpath_to_basecall(int const * path, size_t npos, int * pos){
    RETURN_NULL_IF(NULL == path, NULL);
    RETURN_NULL_IF(NULL == pos, NULL);

    //  States < NBASE emit a base; higher states are silent
    int nbase = 0;
    for(size_t i=0 ; i < npos ; i++){
        if(path[i] < NBASE){
            nbase += 1;
        }
    }

    char * basecall = calloc(nbase + 1, sizeof(char));
    RETURN_NULL_IF(NULL == basecall, NULL);

    for(size_t i=0, bpos=0 ; i < npos ; i++){
        if(path[i] < NBASE){
            assert(bpos < (size_t)nbase);
            pos[bpos] = i;  // Record position in path where each base was emitted
            basecall[bpos] = base_lookup[path[i]];
            bpos += 1;
        }
    }

    return basecall;
}
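
For readers more comfortable with numpy than SSE-flavoured C, the following sketch restates decode_crf under the assumption (matching the indexing above) that block t's scores form an (nstate, nstate) matrix whose [s1, s2] entry scores arriving in state s1 from previous state s2; decode_crf_np is an illustrative name, not part of the commit:

import numpy as np

def decode_crf_np(trans):
    """Viterbi decode over CRF transition scores of shape (nblk, nstate, nstate)."""
    nblk, nstate, _ = trans.shape
    prev = np.zeros(nstate)                   # scores start at zero, as with calloc
    tb = np.zeros((nblk, nstate), dtype=int)  # traceback pointers
    for t in range(nblk):
        score = trans[t] + prev[None, :]      # score[s1, s2]
        tb[t] = np.argmax(score, axis=1)      # best predecessor of each s1
        prev = np.max(score, axis=1)
    path = np.zeros(nblk + 1, dtype=int)      # one more entry than blocks
    path[nblk] = int(np.argmax(prev))
    for t in range(nblk, 0, -1):
        path[t - 1] = tb[t - 1, path[t]]
    return float(np.max(prev)), path

For example, decode_crf_np(np.random.randn(10, 5, 5)) returns the best path score together with a length-11 state sequence.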
3 changes: 3 additions & 0 deletions src/decode.h
@@ -24,4 +24,7 @@ float sloika_viterbi(const_scrappie_matrix logpost, float stay_pen, float skip_p
float argmax_decoder(const_scrappie_matrix logpost, int *seq);
char *ctc_remove_stays_and_repeats(const int *seq, int n, int *pos);

float decode_crf(const_scrappie_matrix trans, int * path);
char * crfpath_to_basecall(int const * path, size_t npos, int * pos);

#endif /* DECODE_H */
75 changes: 75 additions & 0 deletions src/layers.c
@@ -299,6 +299,24 @@ scrappie_matrix residual(const_scrappie_matrix X, const_scrappie_matrix fX, scra
    return C;
}

void residual_inplace(const_scrappie_matrix X, scrappie_matrix fX) {
    RETURN_NULL_IF(NULL == X, );
    RETURN_NULL_IF(NULL == fX, );
    const size_t nr = X->nr;
    const size_t nrq = X->nrq;
    const size_t nc = X->nc;
    assert(nr == fX->nr);
    assert(nrq == fX->nrq);
    assert(nc == fX->nc);

    //  Add input X onto fX, one SSE vector (four floats) at a time
    for(size_t c=0 ; c < nc ; c++){
        const size_t offset = c * nrq;
        for(size_t r=0 ; r < nrq ; r++){
            fX->data.v[offset + r] += X->data.v[offset + r];
        }
    }
}

scrappie_matrix softmax(const_scrappie_matrix X, const_scrappie_matrix W,
                        const_scrappie_matrix b, scrappie_matrix C) {
    C = feedforward_exp(X, W, b, C);
@@ -641,3 +659,60 @@ void lstm_step(const_scrappie_matrix xAffine, const_scrappie_matrix out_prev,
            * TANHFV(state->data.v[i]);
    }
}


float crf_partition_function(const_scrappie_matrix C){
    RETURN_NULL_IF(NULL == C, NAN);

    //  Each column of C holds an nstate x nstate matrix of transition scores
    const size_t nstate = roundf(sqrtf((float)C->nr));
    assert(nstate * nstate == C->nr);
    float * mem = calloc(2 * nstate, sizeof(float));
    RETURN_NULL_IF(NULL == mem, NAN);

    float * curr = mem;
    float * prev = mem + nstate;

    //  Forward pass of the forward algorithm, in log space
    for(size_t c=0 ; c < C->nc ; c++){
        const size_t offset = c * C->nrq * 4;
        {   // Swap working buffers
            float * tmp = curr;
            curr = prev;
            prev = tmp;
        }
        for(size_t st1=0 ; st1 < nstate ; st1++){
            const size_t offsetS = offset + st1 * nstate;
            curr[st1] = C->data.f[offsetS + 0] + prev[0];
            for(size_t st2=1 ; st2 < nstate ; st2++){
                curr[st1] = logsumexpf(curr[st1], C->data.f[offsetS + st2] + prev[st2]);
            }
        }
    }

    float logZ = curr[0];
    for(size_t st=1 ; st < nstate ; st++){
        logZ = logsumexpf(logZ, curr[st]);
    }

    free(mem);

    return logZ;
}


scrappie_matrix globalnorm(const_scrappie_matrix X, const_scrappie_matrix W,
                           const_scrappie_matrix b, scrappie_matrix C) {
    C = affine_map(X, W, b, C);
    RETURN_NULL_IF(NULL == C, NULL);

    //  Subtract each block's share of log Z so scores are globally normalised
    const float logZ = crf_partition_function(C) / (float)C->nc;

    for(size_t c=0 ; c < C->nc ; c++){
        const size_t offset = c * C->nrq * 4;
        for(size_t r=0 ; r < C->nr ; r++){
            C->data.f[offset + r] -= logZ;
        }
    }

    return C;
}
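
The same view in numpy may help with crf_partition_function and globalnorm: the partition function is the Viterbi recursion with max replaced by logsumexp, and globalnorm spreads log Z evenly across blocks. A hedged sketch (illustrative names; scipy's logsumexp stands in for scrappie's logsumexpf):

import numpy as np
from scipy.special import logsumexp

def crf_partition_function_np(trans):
    """Log partition function over transition scores of shape (nblk, nstate, nstate)."""
    nblk, nstate, _ = trans.shape
    prev = np.zeros(nstate)
    for t in range(nblk):
        # curr[s1] = logsumexp over s2 of trans[t][s1, s2] + prev[s2]
        prev = logsumexp(trans[t] + prev[None, :], axis=1)
    return float(logsumexp(prev))

def globalnorm_np(trans):
    # Subtract each block's share of log Z, mirroring globalnorm above
    return trans - crf_partition_function_np(trans) / trans.shape[0]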
6 changes: 6 additions & 0 deletions src/layers.h
@@ -27,6 +27,7 @@ scrappie_matrix feedforward_exp(const_scrappie_matrix X,
                                const_scrappie_matrix b, scrappie_matrix C);
scrappie_matrix residual(const_scrappie_matrix X, const_scrappie_matrix fX,
                         scrappie_matrix C);
void residual_inplace(const_scrappie_matrix X, scrappie_matrix fX);
scrappie_matrix softmax(const_scrappie_matrix X, const_scrappie_matrix W,
                        const_scrappie_matrix b, scrappie_matrix C);

@@ -52,4 +53,9 @@ void lstm_step(const_scrappie_matrix x, const_scrappie_matrix out_prev,
               const_scrappie_matrix sW,
               const_scrappie_matrix peep, scrappie_matrix xF,
               scrappie_matrix state, scrappie_matrix output);


scrappie_matrix globalnorm(const_scrappie_matrix X, const_scrappie_matrix W,
                           const_scrappie_matrix b, scrappie_matrix C);
float crf_partition_function(const_scrappie_matrix C);

#endif /* LAYERS_H */
