This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Commit

Merge branch 'rrnrf' into 'master'
Recurrent Neural Network -- Random Fields

See merge request algorithm/scrappie!134
tmassingham-ont committed Dec 9, 2017
2 parents c286f86 + f60db0e commit a251397
Showing 13 changed files with 5,239 additions and 26 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -15,7 +15,7 @@ set (CPACK_PACKAGE_VENDOR "Oxford Nanopore Technologies")
set (CPACK_PACKAGE_DESCRIPTION_FILE "${PROJECT_SOURCE_DIR}/README.md")
set (CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE.md")
set (CPACK_PACKAGE_VERSION_MAJOR 1)
set (CPACK_PACKAGE_VERSION_MINOR 2)
set (CPACK_PACKAGE_VERSION_MINOR 3)
set (CPACK_PACKAGE_VERSION_PATCH 0)

# Get the latest abbreviated commit hash of the working branch
@@ -142,6 +142,7 @@ add_test(test_raw_call scrappie raw --model raw_r94 ${USE_THREADS} ${READSDIR})
add_test(test_rawrgr_call scrappie raw --model rgr_r94 ${USE_THREADS} ${READSDIR})
add_test(test_rawrgrgr_r94_call scrappie raw --model rgrgr_r94 ${USE_THREADS} ${READSDIR})
add_test(test_rawrgrgr_r95_call scrappie raw --model rgrgr_r95 ${USE_THREADS} ${READSDIR})
add_test(test_rawrnnrf_r94_call scrappie raw --model rnnrf_r94 ${USE_THREADS} ${READSDIR})
add_test(test_squiggle scrappie squiggle ${READSDIR}/test_squiggle.fa)
add_test(test_licence scrappie licence)
add_test(test_license scrappie license)
2 changes: 1 addition & 1 deletion README.md
@@ -111,7 +111,7 @@ Scrappie basecaller -- basecall from raw signal
    --local=penalty             Penalty for local basecalling
-m, --min_prob=probability      Minimum bound on probability of match
    --model=name                Raw model to use: "raw_r94", "rgr_r94",
                                "rgrgr_r94", "rgrgr_r95"
                                "rgrgr_r94", "rgrgr_r95", "rnnrf_r94"
-o, --output=filename           Write to file rather than stdout
-p, --prefix=string             Prefix to append to name of each read
-s, --skip=penalty              Penalty for skipping a base
3 changes: 3 additions & 0 deletions RELEASES.md
@@ -27,6 +27,9 @@ Scrappie's purpose is to demonstrate the next generation of base calling and, as

# Release history
The intention is that behaviour will be stable within a series, with only bug fixes or minor improvements being applied. An improvement or change in behaviour that is not a major shift in the algorithm will be a new series with a bump of the minor version number. Any major changes in the algorithm will be a new series with the major number bumped.
* 1.3 series: Recurrent Neural Network - Random Field models
* *release-1.3.0*
* Initial support for RNN-RF models for raw data calling (R9.4 only).
* 1.2 series: Sequence-to-squiggle
* *release-1.2.0*
* Ability to predict expected squiggle from sequence
128 changes: 128 additions & 0 deletions misc/parse_rnnrf.py
@@ -0,0 +1,128 @@
#!/usr/bin/env python3
import argparse
import pickle
import math
import numpy as np
import re
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--id', default='', help='Identifier for model names')
parser.add_argument('model', help='Pickle to read model from')


trim_trailing_zeros = re.compile('0+p')

def small_hex(f):
    hf = float(f).hex()
    return trim_trailing_zeros.sub('p', hf)


def process_column(v, pad):
    """ process and pad """
    return [small_hex(f) for f in v] + [small_hex(0.0)] * pad


def cformatM(fh, name, X, nr=None, nc=None):
    nrq = int(math.ceil(X.shape[1] / 4.0))
    pad = nrq * 4 - X.shape[1]
    lines = map(lambda v: ', '.join(process_column(v, pad)), X)

    if nr is None:
        nr = X.shape[1]
    else:
        nrq = int(math.ceil(nr / 4.0))
    if nc is None:
        nc = X.shape[0]

    fh.write('float {}[] = {}\n'.format('__' + name, '{'))
    fh.write('\t' + ',\n\t'.join(lines))
    fh.write('};\n')
    fh.write('_Mat {} = {}\n\t.nr = {},\n\t.nrq = {},\n\t.nc = {},\n\t.data.f = {}\n{};\n'.format('_' + name, '{', nr, nrq, nc, '__' + name, '}'))
    fh.write('const scrappie_matrix {} = &{};\n\n'.format(name, '_' + name))


def cformatV(fh, name, X):
    nrq = int(math.ceil(X.shape[0] / 4.0))
    pad = nrq * 4 - X.shape[0]
    lines = ', '.join(list(map(lambda f: small_hex(f), X)) + [small_hex(0.0)] * pad)
    fh.write('float {}[] = {}\n'.format('__' + name, '{'))
    fh.write('\t' + lines)
    fh.write('};\n')
    fh.write('_Mat {} = {}\n\t.nr = {},\n\t.nrq = {},\n\t.nc = {},\n\t.data.f = {}\n{};\n'.format('_' + name, '{', X.shape[0], nrq, 1, '__' + name, '}'))
    fh.write('const scrappie_matrix {} = &{};\n\n'.format(name, '_' + name))


if __name__ == '__main__':
    args = parser.parse_args()
    modelid = args.id + '_'

    with open(args.model, 'rb') as fh:
        network = pickle.load(fh, encoding='latin1')
    assert network.major_version == 2, "Sloika model must be major version 2 but model is {}.\nPerhaps you need to run Sloika's model_upgrade.py".format(network.major_version)

    sys.stdout.write("""#pragma once
#ifndef NANONET_RNNRF_{}MODEL_H
#define NANONET_RNNRF_{}MODEL_H
#include <assert.h>
#include "../util.h"
""".format(modelid.upper(), modelid.upper()))

    """ Convolution layer
    """
    filterW = network.sublayers[0].W.get_value()
    nfilter, _, winlen = filterW.shape
    cformatM(sys.stdout, 'conv_rnnrf_{}W'.format(modelid), filterW.reshape(-1, 1), nr=winlen * 4 - 3, nc=nfilter)
    cformatV(sys.stdout, 'conv_rnnrf_{}b'.format(modelid), network.sublayers[0].b.get_value().reshape(-1))
    sys.stdout.write("const int conv_rnnrf_{}stride = {};\n".format(modelid, network.sublayers[0].stride))
    sys.stdout.write("""const size_t _conv_rnnrf_{}nfilter = {};
const size_t _conv_rnnrf_{}winlen = {};
""".format(modelid, nfilter, modelid, winlen))

    """ Backward GRU (first layer)
    """
    gru1 = network.sublayers[1].sublayers[0].sublayers[0]
    cformatM(sys.stdout, 'gruB1_rnnrf_{}iW'.format(modelid), gru1.iW.get_value())
    cformatM(sys.stdout, 'gruB1_rnnrf_{}sW'.format(modelid), gru1.sW.get_value())
    cformatM(sys.stdout, 'gruB1_rnnrf_{}sW2'.format(modelid), gru1.sW2.get_value())
    cformatV(sys.stdout, 'gruB1_rnnrf_{}b'.format(modelid), gru1.b.get_value().reshape(-1))

    """ Forward GRU (second layer)
    """
    gru2 = network.sublayers[2].sublayers[0]
    cformatM(sys.stdout, 'gruF2_rnnrf_{}iW'.format(modelid), gru2.iW.get_value())
    cformatM(sys.stdout, 'gruF2_rnnrf_{}sW'.format(modelid), gru2.sW.get_value())
    cformatM(sys.stdout, 'gruF2_rnnrf_{}sW2'.format(modelid), gru2.sW2.get_value())
    cformatV(sys.stdout, 'gruF2_rnnrf_{}b'.format(modelid), gru2.b.get_value().reshape(-1))

    """ Backward GRU (third layer)
    """
    gru3 = network.sublayers[3].sublayers[0].sublayers[0]
    cformatM(sys.stdout, 'gruB3_rnnrf_{}iW'.format(modelid), gru3.iW.get_value())
    cformatM(sys.stdout, 'gruB3_rnnrf_{}sW'.format(modelid), gru3.sW.get_value())
    cformatM(sys.stdout, 'gruB3_rnnrf_{}sW2'.format(modelid), gru3.sW2.get_value())
    cformatV(sys.stdout, 'gruB3_rnnrf_{}b'.format(modelid), gru3.b.get_value().reshape(-1))

    """ Forward GRU (fourth layer)
    """
    gru4 = network.sublayers[4].sublayers[0]
    cformatM(sys.stdout, 'gruF4_rnnrf_{}iW'.format(modelid), gru4.iW.get_value())
    cformatM(sys.stdout, 'gruF4_rnnrf_{}sW'.format(modelid), gru4.sW.get_value())
    cformatM(sys.stdout, 'gruF4_rnnrf_{}sW2'.format(modelid), gru4.sW2.get_value())
    cformatV(sys.stdout, 'gruF4_rnnrf_{}b'.format(modelid), gru4.b.get_value().reshape(-1))

    """ Backward GRU (fifth layer)
    """
    gru5 = network.sublayers[5].sublayers[0].sublayers[0]
    cformatM(sys.stdout, 'gruB5_rnnrf_{}iW'.format(modelid), gru5.iW.get_value())
    cformatM(sys.stdout, 'gruB5_rnnrf_{}sW'.format(modelid), gru5.sW.get_value())
    cformatM(sys.stdout, 'gruB5_rnnrf_{}sW2'.format(modelid), gru5.sW2.get_value())
    cformatV(sys.stdout, 'gruB5_rnnrf_{}b'.format(modelid), gru5.b.get_value().reshape(-1))

    """ Global norm layer
    """
    nstate = network.sublayers[6].W.get_value().shape[0]
    cformatM(sys.stdout, 'FF_rnnrf_{}W'.format(modelid), network.sublayers[6].W.get_value())
    cformatV(sys.stdout, 'FF_rnnrf_{}b'.format(modelid), network.sublayers[6].b.get_value())

    sys.stdout.write('#endif /* NANONET_RNNRF_{}MODEL_H */'.format(modelid.upper()))
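
The generator above leans on two details worth noting: scrappie stores matrices as quads of four floats, so every column is zero-padded to nrq = ceil(nr / 4) quads, and weights are emitted as C99 hexadecimal float literals, which round-trip bit-exactly through the C compiler. A minimal self-contained sketch (not part of this commit; values are illustrative only):

import math
import re

trim_trailing_zeros = re.compile('0+p')

def small_hex(f):
    # float.hex() is an exact C99-style literal, e.g. 1.5 -> '0x1.8000000000000p+0'
    return trim_trailing_zeros.sub('p', float(f).hex())

nr = 6                          # rows in one column of a weight matrix
nrq = int(math.ceil(nr / 4.0))  # quads of four floats needed: 2
pad = nrq * 4 - nr              # zeros appended to fill the last quad: 2

column = [1.5, -2.0, 3.0, 0.25, 7.0, 1.0]
print(', '.join([small_hex(f) for f in column] + [small_hex(0.0)] * pad))
# 0x1.8p+0, -0x1.p+1, 0x1.8p+1, 0x1.p-2, 0x1.cp+2, 0x1.p+0, 0x0.p+0, 0x0.p+0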
80 changes: 80 additions & 0 deletions src/decode.c
@@ -829,3 +829,83 @@ float sloika_viterbi(const_scrappie_matrix logpost, float stay_pen, float skip_p

    return logscore;
}

float decode_crf(const_scrappie_matrix trans, int * path){
    RETURN_NULL_IF(NULL == trans, NAN);
    RETURN_NULL_IF(NULL == path, NAN);

    //  Each column of trans holds an nstate x nstate matrix of transition scores
    const size_t nstate = roundf(sqrtf((float)trans->nr));
    assert(nstate * nstate == trans->nr);
    float * mem = calloc(2 * nstate, sizeof(float));
    RETURN_NULL_IF(NULL == mem, NAN);

    float * curr = mem;
    float * prev = mem + nstate;

    const size_t nblk = trans->nc;

    scrappie_imatrix tb = make_scrappie_imatrix(nstate, nblk);
    if(NULL == tb){
        free(mem);
        return NAN;
    }

    //  Forwards Viterbi pass
    for(size_t blk=0 ; blk < nblk ; blk++){
        const size_t offset = blk * trans->nrq * 4;
        const size_t tboffset = blk * tb->nrq * 4;
        {   // Swap working buffers
            float * tmp = curr;
            curr = prev;
            prev = tmp;
        }

        for(size_t st1=0 ; st1 < nstate ; st1++){
            //  Best score of reaching st1 over all previous states st2,
            //  recording the argmax for traceback
            const size_t offsetS = offset + st1 * nstate;
            curr[st1] = trans->data.f[offsetS + 0] + prev[0];
            tb->data.f[tboffset + st1] = 0;
            for(size_t st2=1 ; st2 < nstate ; st2++){
                const float score = trans->data.f[offsetS + st2] + prev[st2];
                if(score > curr[st1]){
                    curr[st1] = score;
                    tb->data.f[tboffset + st1] = st2;
                }
            }
        }
    }

    //  Traceback -- path must have space for nblk + 1 entries
    const float score = valmaxf(curr, nstate);
    path[nblk] = argmaxf(curr, nstate);
    for(size_t blk=nblk ; blk > 0 ; blk--){
        const size_t offset = (blk - 1) * tb->nrq * 4;
        path[blk - 1] = tb->data.f[offset + path[blk]];
    }

    tb = free_scrappie_imatrix(tb);
    free(mem);

    return score;
}

char * crfpath_to_basecall(int const * path, size_t npos, int * pos){
    RETURN_NULL_IF(NULL == path, NULL);
    RETURN_NULL_IF(NULL == pos, NULL);

    //  States < NBASE emit a base; higher states are silent
    int nbase = 0;
    for(size_t i=0 ; i < npos ; i++){
        if(path[i] < NBASE){
            nbase += 1;
        }
    }

    char * basecall = calloc(nbase + 1, sizeof(char));
    RETURN_NULL_IF(NULL == basecall, NULL);

    for(size_t i=0, bpos=0 ; i < npos ; i++){
        if(path[i] < NBASE){
            assert(bpos < (size_t)nbase);
            pos[bpos] = i;  // Record position in path where each base was emitted
            basecall[bpos] = base_lookup[path[i]];
            bpos += 1;
        }
    }

    return basecall;
}
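
For readers more comfortable with numpy than SSE-flavoured C, the following sketch restates decode_crf under the assumption (matching the indexing above) that block t's scores form an (nstate, nstate) matrix whose [s1, s2] entry scores arriving in state s1 from previous state s2; decode_crf_np is an illustrative name, not part of the commit:

import numpy as np

def decode_crf_np(trans):
    """Viterbi decode over CRF transition scores of shape (nblk, nstate, nstate)."""
    nblk, nstate, _ = trans.shape
    prev = np.zeros(nstate)                   # scores start at zero, as with calloc
    tb = np.zeros((nblk, nstate), dtype=int)  # traceback pointers
    for t in range(nblk):
        score = trans[t] + prev[None, :]      # score[s1, s2]
        tb[t] = np.argmax(score, axis=1)      # best predecessor of each s1
        prev = np.max(score, axis=1)
    path = np.zeros(nblk + 1, dtype=int)      # one more entry than blocks
    path[nblk] = int(np.argmax(prev))
    for t in range(nblk, 0, -1):
        path[t - 1] = tb[t - 1, path[t]]
    return float(np.max(prev)), path

For example, decode_crf_np(np.random.randn(10, 5, 5)) returns the best path score together with a length-11 state sequence.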
3 changes: 3 additions & 0 deletions src/decode.h
@@ -24,4 +24,7 @@ float sloika_viterbi(const_scrappie_matrix logpost, float stay_pen, float skip_p
float argmax_decoder(const_scrappie_matrix logpost, int *seq);
char *ctc_remove_stays_and_repeats(const int *seq, int n, int *pos);

float decode_crf(const_scrappie_matrix trans, int * path);
char * crfpath_to_basecall(int const * path, size_t npos, int * pos);

#endif /* DECODE_H */
75 changes: 75 additions & 0 deletions src/layers.c
@@ -299,6 +299,24 @@ scrappie_matrix residual(const_scrappie_matrix X, const_scrappie_matrix fX, scra
    return C;
}

void residual_inplace(const_scrappie_matrix X, scrappie_matrix fX) {
    RETURN_NULL_IF(NULL == X, );
    RETURN_NULL_IF(NULL == fX, );
    const size_t nr = X->nr;
    const size_t nrq = X->nrq;
    const size_t nc = X->nc;
    assert(nr == fX->nr);
    assert(nrq == fX->nrq);
    assert(nc == fX->nc);

    //  Add input X onto fX, one SSE vector (four floats) at a time
    for(size_t c=0 ; c < nc ; c++){
        const size_t offset = c * nrq;
        for(size_t r=0 ; r < nrq ; r++){
            fX->data.v[offset + r] += X->data.v[offset + r];
        }
    }
}

scrappie_matrix softmax(const_scrappie_matrix X, const_scrappie_matrix W,
                        const_scrappie_matrix b, scrappie_matrix C) {
    C = feedforward_exp(X, W, b, C);
@@ -641,3 +659,60 @@ void lstm_step(const_scrappie_matrix xAffine, const_scrappie_matrix out_prev,
            * TANHFV(state->data.v[i]);
    }
}


float crf_partition_function(const_scrappie_matrix C){
    RETURN_NULL_IF(NULL == C, NAN);

    //  Each column of C holds an nstate x nstate matrix of transition scores
    const size_t nstate = roundf(sqrtf((float)C->nr));
    assert(nstate * nstate == C->nr);
    float * mem = calloc(2 * nstate, sizeof(float));
    RETURN_NULL_IF(NULL == mem, NAN);

    float * curr = mem;
    float * prev = mem + nstate;

    //  Forward pass of the forward algorithm, in log space
    for(size_t c=0 ; c < C->nc ; c++){
        const size_t offset = c * C->nrq * 4;
        {   // Swap working buffers
            float * tmp = curr;
            curr = prev;
            prev = tmp;
        }
        for(size_t st1=0 ; st1 < nstate ; st1++){
            const size_t offsetS = offset + st1 * nstate;
            curr[st1] = C->data.f[offsetS + 0] + prev[0];
            for(size_t st2=1 ; st2 < nstate ; st2++){
                curr[st1] = logsumexpf(curr[st1], C->data.f[offsetS + st2] + prev[st2]);
            }
        }
    }

    float logZ = curr[0];
    for(size_t st=1 ; st < nstate ; st++){
        logZ = logsumexpf(logZ, curr[st]);
    }

    free(mem);

    return logZ;
}


scrappie_matrix globalnorm(const_scrappie_matrix X, const_scrappie_matrix W,
                           const_scrappie_matrix b, scrappie_matrix C) {
    C = affine_map(X, W, b, C);
    RETURN_NULL_IF(NULL == C, NULL);

    //  Subtract each block's share of log Z so scores are globally normalised
    const float logZ = crf_partition_function(C) / (float)C->nc;

    for(size_t c=0 ; c < C->nc ; c++){
        const size_t offset = c * C->nrq * 4;
        for(size_t r=0 ; r < C->nr ; r++){
            C->data.f[offset + r] -= logZ;
        }
    }

    return C;
}
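
The same view in numpy may help with crf_partition_function and globalnorm: the partition function is the Viterbi recursion with max replaced by logsumexp, and globalnorm spreads log Z evenly across blocks. A hedged sketch (illustrative names; scipy's logsumexp stands in for scrappie's logsumexpf):

import numpy as np
from scipy.special import logsumexp

def crf_partition_function_np(trans):
    """Log partition function over transition scores of shape (nblk, nstate, nstate)."""
    nblk, nstate, _ = trans.shape
    prev = np.zeros(nstate)
    for t in range(nblk):
        # curr[s1] = logsumexp over s2 of trans[t][s1, s2] + prev[s2]
        prev = logsumexp(trans[t] + prev[None, :], axis=1)
    return float(logsumexp(prev))

def globalnorm_np(trans):
    # Subtract each block's share of log Z, mirroring globalnorm above
    return trans - crf_partition_function_np(trans) / trans.shape[0]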
6 changes: 6 additions & 0 deletions src/layers.h
@@ -27,6 +27,7 @@ scrappie_matrix feedforward_exp(const_scrappie_matrix X,
                                const_scrappie_matrix b, scrappie_matrix C);
scrappie_matrix residual(const_scrappie_matrix X, const_scrappie_matrix fX,
                         scrappie_matrix C);
void residual_inplace(const_scrappie_matrix X, scrappie_matrix fX);
scrappie_matrix softmax(const_scrappie_matrix X, const_scrappie_matrix W,
                        const_scrappie_matrix b, scrappie_matrix C);

@@ -52,4 +53,9 @@ void lstm_step(const_scrappie_matrix x, const_scrappie_matrix out_prev,
               const_scrappie_matrix sW,
               const_scrappie_matrix peep, scrappie_matrix xF,
               scrappie_matrix state, scrappie_matrix output);


scrappie_matrix globalnorm(const_scrappie_matrix X, const_scrappie_matrix W,
                           const_scrappie_matrix b, scrappie_matrix C);
float crf_partition_function(const_scrappie_matrix C);

#endif /* LAYERS_H */
