BioJulia · jakobnissen · Oct 16, 2023 · Oct 16, 2023 · Oct 16, 2023 · Oct 16, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
-## [UNRELEASED]
+## [3.2.0]
+* Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence
+  of an unknown alphabet from e.g. a string.
 * Relax requirement of `decode`, such that it no longer needs to check for
   invalid data. Note that this change is not breaking, since it is not possible
   for correctly-implemented `Alphabet` and `BioSequence` to store invalid data.

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioSequences"
 uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 authors = ["Sabrina Jaye Ward <sabrinajward@protonmail.com>", "Jakob Nissen <jakobnybonissen@gmail.com>"]
-version = "3.1.6"
+version = "3.2.0"
 
 [deps]
 BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"

diff --git a/docs/src/construction.md b/docs/src/construction.md
@@ -302,6 +302,33 @@ the bodies of things like for loops. And if you use them and are unsure, use the
 @aa_str
 ```
 
+## Loose parsing
+As of version 3.2.0, BioSequences.jl provides the [`bioseq`](@ref) function, which can be used to build a `LongSequence`
+from a string (or an `AbstractVector{UInt8}`) without knowing the correct `Alphabet`.
+
+```jldoctest
+julia> bioseq("ATGTGCTGA")
+9nt DNA Sequence:
+ATGTGCTGA
+```
+
+The function will prioritise 2-bit alphabets over 4-bit alphabets, and prefer smaller alphabets (like `DNAAlphabet{4}`) over larger (like `AminoAcidAlphabet`).
+If the input cannot be encoded by any of the built-in alphabets, an error is thrown:
+
+```jldoctest
+julia> bioseq("0!(CC!;#&&%")
+ERROR: cannot encode 0x30 in AminoAcidAlphabet
+[...]
+```
+
+Note that this function is only intended to be used for interactive, ephemeral work.
+The function is necessarily type unstable, and the precise returned alphabet for a given input is a heuristic which is subject to change.
+
+```@docs
+bioseq
+guess_alphabet
+```
+
 ## Comparison to other sequence types
 Following Base standards, BioSequences do not compare equal to other containers even if they have the same elements.
 To e.g. compare a BioSequence with a vector of DNA, compare the elements themselves:

diff --git a/src/BioSequences.jl b/src/BioSequences.jl
@@ -95,6 +95,9 @@ export
     ### Alphabets
     ###
 
+    guess_alphabet,
+    bioseq,
+
     # Types & aliases
     Alphabet,
     NucleicAcidAlphabet,

diff --git a/src/alphabet.jl b/src/alphabet.jl
@@ -94,7 +94,7 @@
 EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val)
 
 function Base.showerror(io::IO, err::EncodeError{A}) where {A}
-    print(io, "cannot encode ", err.val, " in ", A)
+    print(io, "cannot encode ", repr(err.val), " in ", A)
 end
 
 """
@@ -288,3 +288,75 @@
         ascii_encode(::$(atype), x::UInt8) = @inbounds $(tablename)[x + 1]
     end
 end
+
+# Alphabets that `guess_alphabet` can return, ordered from highest to lowest priority.
+# The position of an alphabet in this tuple (minus one) is its bit index in the
+# entries of `GUESS_ALPHABET_LUT`.
+const GUESS_ALPHABETS = (
+    DNAAlphabet{2}(),
+    RNAAlphabet{2}(),
+    DNAAlphabet{4}(),
+    RNAAlphabet{4}(),
+    AminoAcidAlphabet(),
+)
+
+# Lookup table with one entry per ASCII byte, indexed by `byte + 1`.
+# Bit `i - 1` of an entry is set if and only if `GUESS_ALPHABETS[i]` contains a symbol
+# whose upper- or lowercase character is that byte.
+# Every alphabet has its own bit: if the DNA and RNA alphabets shared a bit, an input
+# containing both `T` and `U` would look like a nucleotide sequence, even though
+# neither a DNA nor an RNA alphabet can encode all of it.
+const GUESS_ALPHABET_LUT = let
+    v = zeros(UInt8, 128)
+    for (i, A) in enumerate(GUESS_ALPHABETS)
+        for symbol in A
+            char = Char(symbol)
+            for byte in (UInt8(uppercase(char)), UInt8(lowercase(char)))
+                v[byte + 1] |= 0x01 << (i - 1)
+            end
+        end
+    end
+    Tuple(v)
+end
+
+"""
+    guess_alphabet(s::Union{AbstractString, AbstractVector{UInt8}}) -> Union{Integer, Alphabet}
+
+Pick an `Alphabet` that can encode every byte of the input `s`. If no `Alphabet` can,
+return the index of the first byte of the input at which no candidate alphabet remains.
+This function only knows about the alphabets listed below. If multiple alphabets are possible,
+pick the first from the order below (i.e. `DNAAlphabet{2}()` if possible, otherwise `RNAAlphabet{2}()` etc).
+1. `DNAAlphabet{2}()`
+2. `RNAAlphabet{2}()`
+3. `DNAAlphabet{4}()`
+4. `RNAAlphabet{4}()`
+5. `AminoAcidAlphabet()`
+
+Empty input is encodable by every alphabet, and hence gives `DNAAlphabet{2}()`.
+
+!!! warning
+    The functions `bioseq` and `guess_alphabet` are intended for use in interactive
+    sessions, and are not suitable for use in packages or non-ephemeral work.
+    They are type unstable, and their heuristics **are subject to change** in minor versions.
+
+# Examples
+```jldoctest
+julia> guess_alphabet("AGGCA")
+DNAAlphabet{2}()
+
+julia> guess_alphabet("WKLQSTV")
+AminoAcidAlphabet()
+
+julia> guess_alphabet("QAWT+!")
+5
+
+julia> guess_alphabet("UAGCSKMU")
+RNAAlphabet{4}()
+
+julia> guess_alphabet("ATU") # T is not RNA, U is not DNA
+AminoAcidAlphabet()
+```
+"""
+function guess_alphabet(v::AbstractVector{UInt8})
+    # Start with one set bit per known alphabet, and clear the bits of the alphabets
+    # that cannot encode a byte as the input is scanned.
+    possibilities = (0x01 << length(GUESS_ALPHABETS)) - 0x01
+    for (index, byte) in pairs(v)
+        # Non-ASCII bytes are not encodable by any of the alphabets
+        possibilities &= byte < 0x80 ? GUESS_ALPHABET_LUT[byte + 1] : 0x00
+        iszero(possibilities) && return index
+    end
+    # The lowest set bit belongs to the highest-priority alphabet still possible
+    return GUESS_ALPHABETS[trailing_zeros(possibilities) + 1]
+end
+guess_alphabet(s::AbstractString) = guess_alphabet(codeunits(s))
diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -85,4 +85,38 @@ function LongSequence{A}(
     return copyto!(seq, 1, src, first(part), len)
 end
 
-Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)
+Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)
+
+"""
+    bioseq(s::Union{AbstractString, AbstractVector{UInt8}}) -> LongSequence
+
+Parse `s` into a `LongSequence` with an appropriate `Alphabet`, or throw an exception
+if no alphabet matches.
+See [`guess_alphabet`](@ref) for the available alphabets and the alphabet priority.
+
+!!! warning
+    The functions `bioseq` and `guess_alphabet` are intended for use in interactive
+    sessions, and are not suitable for use in packages or non-ephemeral work.
+    They are type unstable, and their heuristics **are subject to change** in minor versions.
+
+# Examples
+```jldoctest
+julia> bioseq("QMKLPEEFW")
+9aa Amino Acid Sequence:
+QMKLPEEFW
+
+julia> bioseq("UAUGCUGUAGG")
+11nt RNA Sequence:
+UAUGCUGUAGG
+
+julia> bioseq("PKMW#3>>0;kL")
+ERROR: cannot encode 0x23 in AminoAcidAlphabet
+[...]
+```
+"""
+function bioseq(s::AbstractVector{UInt8})
+    guess = guess_alphabet(s)
+    if guess isa Integer
+        # No alphabet matched: `guess` is the index of the offending byte.
+        throw(EncodeError(AminoAcidAlphabet(), s[guess]))
+    end
+    return LongSequence{typeof(guess)}(s)
+end
+bioseq(s::AbstractString) = bioseq(codeunits(s))
diff --git a/test/alphabet.jl b/test/alphabet.jl
@@ -185,4 +185,31 @@ end
     @test BioSequences.has_interface(Alphabet, RNAAlphabet{2}())
     @test BioSequences.has_interface(Alphabet, RNAAlphabet{4}())
     @test BioSequences.has_interface(Alphabet, AminoAcidAlphabet())
+end
+
+@testset "Guess parsing and guess alphabet" begin
+    # Every input container type that both functions are exercised with
+    input_types = [String, SubString, Vector{UInt8}, Test.GenericString]
+
+    # Inputs paired with the alphabet they are expected to resolve to
+    encodable = [
+        (DNAAlphabet{2}(), ["", "TAGCT", "AAA"]),
+        (RNAAlphabet{2}(), ["U", "CGGUAG", "CCCCU"]),
+        (DNAAlphabet{4}(), ["W", "HKW--", "TAGCTATSG", "TAGC-TAG"]),
+        (RNAAlphabet{4}(), ["WU", "HAUH-CD", "VKMNSU"]),
+        (AminoAcidAlphabet(), ["Q", "PLBMW", "***"])
+    ]
+    for (alphabet, inputs) in encodable, input in inputs, T in input_types
+        @test guess_alphabet(T(input)) == alphabet
+        @test bioseq(T(input)) isa LongSequence{typeof(alphabet)}
+    end
+
+    # Inputs no alphabet can encode, paired with the expected index of the first bad byte
+    unencodable = [
+        (4, "QMN!KK"),
+        (7, "ATCGAT???"),
+        (1, ";")
+    ]
+    for (bad_index, input) in unencodable, T in input_types
+        @test guess_alphabet(T(input)) == bad_index
+        @test_throws BioSequences.EncodeError bioseq(T(input))
+    end
 end