From 93983f08fc68e1fa9ff2315dfe46a13d6c6192c0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Tue, 20 Jun 2023 16:47:44 +0200 Subject: [PATCH] Add SpanMarker for NER to spaCy universe (#12730) * Add SpanMarker for NER to spaCy universe * Escape the newlines in the text in the code example Or at least, attempt to * Remove now unnecessary import * Disable NER pipeline component in code example --- website/meta/universe.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index a8ddd55f22c..cd3bedbff14 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4361,6 +4361,37 @@ }, "category": ["apis", "standalone"], "tags": ["apis", "deployment"] + }, + { + "id": "span_marker", + "title": "SpanMarker", + "slogan": "Effortless state-of-the-art NER in spaCy", + "description": "The SpanMarker integration with spaCy allows you to seamlessly replace the default spaCy `\"ner\"` pipeline component with any [SpanMarker model available on the Hugging Face Hub](https://huggingface.co/models?library=span-marker). Through this, you can take advantage of the advanced Named Entity Recognition capabilities of SpanMarker within the familiar and powerful spaCy framework.\n\nBy default, the `span_marker` pipeline component uses a [SpanMarker model using RoBERTa-large trained on OntoNotes v5.0](https://huggingface.co/tomaarsen/span-marker-roberta-large-ontonotes5). This model reaches a competitive 91.54 F1, notably higher than the [85.5 and 89.8 F1](https://spacy.io/usage/facts-figures#section-benchmarks) from `en_core_web_lg` and `en_core_web_trf`, respectively. 
A short head-to-head between this SpanMarker model and the `trf` spaCy model has been posted [here](https://github.com/tomaarsen/SpanMarkerNER/pull/12).\n\nAdditionally, see [here](https://tomaarsen.github.io/SpanMarkerNER/notebooks/spacy_integration.html) for documentation on using SpanMarker with spaCy.", + "github": "tomaarsen/SpanMarkerNER", + "pip": "span_marker", + "code_example": [ + "import spacy", + "", + "nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])", + "nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})", + "", + "text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\", + "Ptolemaic Kingdom of Egypt. She was born in 69 BCE and ruled Egypt from 51 BCE until her \\", + "death in 30 BCE.\"\"\"", + "doc = nlp(text)", + "print([(entity, entity.label_) for entity in doc.ents])", + "# [(Cleopatra VII, 'PERSON'), (Cleopatra the Great, 'PERSON'), (the Ptolemaic Kingdom of Egypt, 'GPE'),", + "# (69 BCE, 'DATE'), (Egypt, 'GPE'), (51 BCE, 'DATE'), (30 BCE, 'DATE')]" + ], + "code_language": "python", + "url": "https://tomaarsen.github.io/SpanMarkerNER", + "author": "Tom Aarsen", + "author_links": { + "github": "tomaarsen", + "website": "https://www.linkedin.com/in/tomaarsen" + }, + "category": ["pipeline", "standalone", "scientific"], + "tags": ["ner"] } ],