diff --git a/backend/init.sql b/backend/init.sql index 5d505c5e..02b29458 100644 --- a/backend/init.sql +++ b/backend/init.sql @@ -11,6 +11,8 @@ -- Generated columns: -- https://www.postgresql.org/docs/current/ddl-generated-columns.html +CREATE EXTENSION IF NOT EXISTS pg_trgm; -- for trigram matching fuzzy search similarity() func + /* * Species Table */ @@ -28,8 +30,8 @@ CREATE TABLE proteins ( description text, length integer, -- length of amino acid sequence mass numeric, -- mass in amu/daltons - content bytea, -- stored markdown for the protein article (TODO: consider having a limit to how big this can be) - refs bytea, -- bibtex references mentioned in the content/article + content text, -- stored markdown for the protein article (TODO: consider having a limit to how big this can be) + refs text, -- bibtex references mentioned in the content/article species_id integer NOT NULL, thumbnail bytea, -- thumbnail image of the protein in base64 format FOREIGN KEY (species_id) REFERENCES species(id) ON UPDATE CASCADE ON DELETE CASCADE diff --git a/backend/src/api/protein.py b/backend/src/api/protein.py index f35f9a3e..2791af40 100644 --- a/backend/src/api/protein.py +++ b/backend/src/api/protein.py @@ -164,10 +164,6 @@ def get_protein_entry(protein_name: str): ) = only_returned_entry # if byte arrays are present, decode them into a string - if content is not None: - content = bytea_to_str(content) - if refs is not None: - refs = bytea_to_str(refs) if thumbnail is not None: thumbnail = bytea_to_str(thumbnail) @@ -263,8 +259,8 @@ def upload_protein_entry(body: UploadBody, req: Request): body.description, pdb.num_amino_acids, pdb.mass_daltons, - str_to_bytea(body.content), - str_to_bytea(body.refs), + body.content, + body.refs, body.species_name, ], ) @@ -310,7 +306,7 @@ def edit_protein_entry(body: EditBody, req: Request): db.execute( """UPDATE proteins SET content = %s WHERE name = %s""", [ - str_to_bytea(body.new_content), + body.new_content, body.old_name if not name_changed else 
body.new_name, ], ) @@ -319,7 +315,7 @@ def edit_protein_entry(body: EditBody, req: Request): db.execute( """UPDATE proteins SET refs = %s WHERE name = %s""", [ - str_to_bytea(body.new_refs), + body.new_refs, body.old_name if not name_changed else body.new_name, ], ) diff --git a/backend/src/api/search.py b/backend/src/api/search.py index b35488d7..5f0bc15e 100644 --- a/backend/src/api/search.py +++ b/backend/src/api/search.py @@ -74,41 +74,61 @@ def get_descriptions(protein_names: list[str]): def gen_sql_filters( + species_table: str, + proteins_table: str, species_filter: str | None, length_filter: RangeFilter | None = None, mass_filter: RangeFilter | None = None, ) -> str: filters = [ - category_where_clause("species.name", species_filter), - range_where_clause("proteins.length", length_filter), - range_where_clause("proteins.mass", mass_filter), + category_where_clause(f"{species_table}.name", species_filter), + range_where_clause(f"{proteins_table}.length", length_filter), + range_where_clause(f"{proteins_table}.mass", mass_filter), ] return " AND " + combine_where_clauses(filters) if any(filters) else "" @router.post("/search/proteins", response_model=SearchProteinsResults) def search_proteins(body: SearchProteinsBody): - title_query = sanitize_query(body.query) + text_query = sanitize_query(body.query) with Database() as db: try: filter_clauses = gen_sql_filters( - body.species_filter, body.length_filter, body.mass_filter + "species", + "proteins_scores", + body.species_filter, + body.length_filter, + body.mass_filter, ) - entries_query = """SELECT proteins.name, - proteins.description, - proteins.length, - proteins.mass, + threshold = 0 + score_filter = ( + f"(proteins_scores.name_score >= {threshold} OR proteins_scores.desc_score >= {threshold} OR proteins_scores.content_score >= {threshold})" # keep rows where any score reaches the threshold (a threshold of 0 keeps every row) + if len(text_query) > 0 + else "TRUE" # show all scores + ) + # TODO: simplify this query + # note that we need a subquery 
because Postgres cannot reference the *_score column aliases in the WHERE clause of the same SELECT + entries_query = """SELECT proteins_scores.name, + proteins_scores.description, + proteins_scores.length, + proteins_scores.mass, species.name, - proteins.thumbnail - FROM proteins - JOIN species ON species.id = proteins.species_id - WHERE proteins.name ILIKE %s""" + proteins_scores.thumbnail + FROM (SELECT *, + COALESCE(similarity(name, %s), 0) as name_score, + COALESCE(similarity(description, %s), 0) as desc_score, + COALESCE(similarity(content, %s), 0) as content_score + FROM proteins) as proteins_scores + JOIN species ON species.id = proteins_scores.species_id + WHERE {} {} + ORDER BY (proteins_scores.name_score*4 + proteins_scores.desc_score*2 + proteins_scores.content_score) DESC; + """.format( + score_filter, filter_clauses + ) # multipliers in ORDER BY are the relevance weights: name x4, description x2, content x1 log.warn(filter_clauses) entries_result = db.execute_return( - sanitize_query(entries_query + filter_clauses), - [ - f"%{title_query}%", - ], + sanitize_query(entries_query), + [text_query, text_query, text_query], ) if entries_result is not None: return SearchProteinsResults( diff --git a/galaxy/.gitignore b/galaxy/.gitignore new file mode 100644 index 00000000..6af1f0bd --- /dev/null +++ b/galaxy/.gitignore @@ -0,0 +1 @@ +master_venom_galaxy/ \ No newline at end of file