diff --git a/README.md b/README.md index d572c72..1faef82 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,8 @@ demo_output: str = sp.render(elements) print_first_n_lines(demo_output, n=7) ```
-TopLevelSectionTitle: PART I — FINANCIAL INFORMATION -TopLevelSectionTitle: Item 1. Financial Statements +TopSectionTitle: PART I — FINANCIAL INFORMATION +TopSectionTitle: Item 1. Financial Statements TitleElement: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited) SupplementaryText: (In millions, except number of ...housands and per share amounts) TableElement: Table with 24 rows, 80 numbers, and 1058 characters. @@ -176,8 +176,8 @@ demo_output: str = sp.render(tree) print_first_n_lines(demo_output, n=7) ```-TopLevelSectionTitle: PART I — FINANCIAL INFORMATION -├── TopLevelSectionTitle: Item 1. Financial Statements +TopSectionTitle: PART I — FINANCIAL INFORMATION +├── TopSectionTitle: Item 1. Financial Statements │ ├── TitleElement: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited) │ │ ├── SupplementaryText: (In millions, except number of ...housands and per share amounts) │ │ ├── TableElement: Table with 24 rows, 80 numbers, and 1058 characters. diff --git a/dev_utils/dashboard_app/view_parsed/export_as.py b/dev_utils/dashboard_app/view_parsed/export_as.py index b1f41c3..4fe563d 100644 --- a/dev_utils/dashboard_app/view_parsed/export_as.py +++ b/dev_utils/dashboard_app/view_parsed/export_as.py @@ -30,7 +30,7 @@ def render_view_parsed_export_as(elements, html: bytes, filename: str): title_level_to_markdown = _get_map_title_levels_to_markdown_headings(elements) output = [] for element in elements: - if isinstance(element, sp.TopLevelSectionTitle): + if isinstance(element, sp.TopSectionTitle): output.append(f"# {element.text}") elif isinstance(element, sp.TextElement): output.append(f"{element.text}") diff --git a/dev_utils/dashboard_app/view_parsed/overlay.py b/dev_utils/dashboard_app/view_parsed/overlay.py index 076b8f1..e31d94b 100644 --- a/dev_utils/dashboard_app/view_parsed/overlay.py +++ b/dev_utils/dashboard_app/view_parsed/overlay.py @@ -25,7 +25,7 @@ "z-index": 10, } COLOR_PAIR = [ - (sp.TopLevelSectionTitle, "rgba(247, 217, 196, 0.7)"), 
+ (sp.TopSectionTitle, "rgba(247, 217, 196, 0.7)"), (sp.TitleElement, "rgba(250, 237, 203, 0.7)"), (sp.TextElement, "rgba(201, 228, 222, 0.7)"), (sp.ImageElement, "rgba(198, 222, 241, 0.7)"), diff --git a/docs/source/notebooks/developer_guide.ipynb b/docs/source/notebooks/developer_guide.ipynb index 2a698fd..b86773d 100644 --- a/docs/source/notebooks/developer_guide.ipynb +++ b/docs/source/notebooks/developer_guide.ipynb @@ -1,1315 +1,1315 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# This will auto-format your code. You can optionally install 'jupyter-black' using pip.\n", - "# Note: this cell is hidden from the HTML output. Read more: https://nbsphinx.readthedocs.io/en/0.2.1/hidden-cells.html\n", - "try:\n", - " import jupyter_black\n", - " jupyter_black.load()\n", - "except ImportError:\n", - " pass" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Developer Guide: Comprehensive Overview" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Welcome to the Comprehensive Developer Guide for `sec-parser`. This guide is designed to provide an in-depth understanding of the `sec-parser` project, whether you're a new developer looking to contribute, or an experienced one seeking to leverage its capabilities. We'll walk you through the codebase, explaining key components and their interactions, and provide examples to help you get started. \n", - "\n", - "This guide is interactive, allowing you to engage with the code and concepts as you learn. 
You can run and modify all the code examples shown here for yourself by cloning the repository and running the [developer_guide.ipynb](https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb) in a Jupyter notebook. \n", - "\n", - "Alternatively, you can also run the notebook directly in your browser using Cloud-based Jupyter environments:\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb)\n", - "[![My Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/alphanome-ai/sec-parser/main?filepath=docs/source/notebooks/developer_guide.ipynb)\n", - "[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/kernels/welcome?src=https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb)\n", - "[![Open in SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb)\n", - "\n", - "Let's dive in!" 
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Environment Setup" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "In order to run the example code in this Guide, you'll need the `sec_parser` package:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " import sec_parser\n", - "except ImportError:\n", - " !pip install -q sec-parser\n", - " import sec_parser" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with a Simplified Example\n", - "\n", - "It will make it easier to follow along if we'll have a specific simplified example in mind. Consider the following HTML:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - " \n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{n+nt}{\\PYZlt{}b}\\PY{n+nt}{\\PYZgt{}}Financial\\PY{+w}{ }Overview\\PY{n+nt}{\\PYZlt{}/b\\PYZgt{}}\n", - "\\PY{n+nt}{\\PYZlt{}p}\\PY{n+nt}{\\PYZgt{}}The\\PY{+w}{ }financial\\PY{+w}{ }sector\\PY{+w}{ }is\\PY{+w}{ }a\\PY{+w}{ }category\\PY{+w}{ }of\\PY{+w}{ }the\\PY{+w}{ }economy\\PY{+w}{ }made\\PY{+w}{ }up\\PY{+w}{ }of\\PY{+w}{ }firms\\PY{+w}{ }that\\PY{+w}{ }provide\\PY{+w}{ }financial\\PY{+w}{ }services\\PY{+w}{ }to\\PY{+w}{ }commercial\\PY{+w}{ }and\\PY{+w}{ }retail\\PY{+w}{ }customers.\\PY{n+nt}{\\PYZlt{}/p\\PYZgt{}}\n", - "\\PY{n+nt}{\\PYZlt{}div}\\PY{n+nt}{\\PYZgt{}}\n", - "\\PY{+w}{ }\\PY{n+nt}{\\PYZlt{}b}\\PY{n+nt}{\\PYZgt{}}Strategies\\PY{+w}{ }of\\PY{+w}{ }Investment\\PY{n+nt}{\\PYZlt{}/b\\PYZgt{}}\n", - "\\PY{+w}{ }\\PY{n+nt}{\\PYZlt{}p}\\PY{n+nt}{\\PYZgt{}}Investment\\PY{+w}{ }strategies\\PY{n+nt}{\\PYZlt{}/font\\PYZgt{}}\\PY{+w}{ }are\\PY{+w}{ }plans\\PY{+w}{ }that\\PY{+w}{ }guide\\PY{+w}{ }investors\\PY{+w}{ }to\\PY{+w}{ }choose\\PY{+w}{ 
}\\PY{n+nt}{\\PYZlt{}font}\\PY{+w}{ }\\PY{n+na}{color=}\\PY{l+s}{\\PYZdq{}green\\PYZdq{}}\\PY{+w}{ }\\PY{n+na}{style=}\\PY{l+s}{\\PYZdq{}color:green\\PYZdq{}}\\PY{n+nt}{\\PYZgt{}}the\\PY{+w}{ }best\\PY{+w}{ }investment\\PY{+w}{ }opportunities\\PY{n+nt}{\\PYZlt{}/font\\PYZgt{}}\\PY{+w}{ }that\\PY{+w}{ }align\\PY{+w}{ }with\\PY{+w}{ }their\\PY{+w}{ }financial\\PY{+w}{ }goals.\\PY{n+nt}{\\PYZlt{}/p\\PYZgt{}}\n", - "\\PY{+w}{ }\\PY{n+nt}{\\PYZlt{}img}\\PY{+w}{ }\\PY{n+na}{src=}\\PY{l+s}{\\PYZdq{}https://en.wikipedia.org/static/images/icons/wikipedia.png\\PYZdq{}}\\PY{+w}{ }\\PY{n+na}{width=}\\PY{l+s}{\\PYZdq{}20\\PYZdq{}}\\PY{+w}{ }\\PY{n+na}{height=}\\PY{l+s}{\\PYZdq{}20\\PYZdq{}}\\PY{n+nt}{\\PYZgt{}}\n", - "\\PY{n+nt}{\\PYZlt{}/div\\PYZgt{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "\n", - "Financial Overview\n", - "<b>Financial Overview</b>\n", - "<p>The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.</p>\n", - "<div>\n", - " <b>Strategies of Investment</b>\n", - " <p>Investment strategies</font> are plans that guide investors to choose <font color="green" style="color:green">the best investment opportunities</font> that align with their financial goals.</p>\n", - " <img src="https://en.wikipedia.org/static/images/icons/wikipedia.png" width="20" height="20">\n", - "</div>\n", - "The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.
\n", - "\n", - " Strategies of Investment\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", - " \n", - "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# This will auto-format your code. You can optionally install 'jupyter-black' using pip.\n", + "# Note: this cell is hidden from the HTML output. Read more: https://nbsphinx.readthedocs.io/en/0.2.1/hidden-cells.html\n", + "try:\n", + " import jupyter_black\n", + " jupyter_black.load()\n", + "except ImportError:\n", + " pass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Developer Guide: Comprehensive Overview" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Welcome to the Comprehensive Developer Guide for `sec-parser`. This guide is designed to provide an in-depth understanding of the `sec-parser` project, whether you're a new developer looking to contribute, or an experienced one seeking to leverage its capabilities. We'll walk you through the codebase, explaining key components and their interactions, and provide examples to help you get started. \n", + "\n", + "This guide is interactive, allowing you to engage with the code and concepts as you learn. You can run and modify all the code examples shown here for yourself by cloning the repository and running the [developer_guide.ipynb](https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb) in a Jupyter notebook. 
\n", + "\n", + "Alternatively, you can also run the notebook directly in your browser using Cloud-based Jupyter environments:\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb)\n", + "[![My Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/alphanome-ai/sec-parser/main?filepath=docs/source/notebooks/developer_guide.ipynb)\n", + "[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/kernels/welcome?src=https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb)\n", + "[![Open in SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/developer_guide.ipynb)\n", + "\n", + "Let's dive in!" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "In order to run the example code in this Guide, you'll need the `sec_parser` package:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import sec_parser\n", + "except ImportError:\n", + " !pip install -q sec-parser\n", + " import sec_parser" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working with a Simplified Example\n", + "\n", + "It will make it easier to follow along if we'll have a specific simplified example in mind. 
Consider the following HTML:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + " \n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{n+nt}{\\PYZlt{}b}\\PY{n+nt}{\\PYZgt{}}Financial\\PY{+w}{ }Overview\\PY{n+nt}{\\PYZlt{}/b\\PYZgt{}}\n", + "\\PY{n+nt}{\\PYZlt{}p}\\PY{n+nt}{\\PYZgt{}}The\\PY{+w}{ }financial\\PY{+w}{ }sector\\PY{+w}{ }is\\PY{+w}{ }a\\PY{+w}{ }category\\PY{+w}{ }of\\PY{+w}{ }the\\PY{+w}{ }economy\\PY{+w}{ }made\\PY{+w}{ }up\\PY{+w}{ }of\\PY{+w}{ }firms\\PY{+w}{ }that\\PY{+w}{ }provide\\PY{+w}{ }financial\\PY{+w}{ }services\\PY{+w}{ }to\\PY{+w}{ }commercial\\PY{+w}{ }and\\PY{+w}{ }retail\\PY{+w}{ }customers.\\PY{n+nt}{\\PYZlt{}/p\\PYZgt{}}\n", + "\\PY{n+nt}{\\PYZlt{}div}\\PY{n+nt}{\\PYZgt{}}\n", + "\\PY{+w}{ }\\PY{n+nt}{\\PYZlt{}b}\\PY{n+nt}{\\PYZgt{}}Strategies\\PY{+w}{ }of\\PY{+w}{ }Investment\\PY{n+nt}{\\PYZlt{}/b\\PYZgt{}}\n", + "\\PY{+w}{ }\\PY{n+nt}{\\PYZlt{}p}\\PY{n+nt}{\\PYZgt{}}Investment\\PY{+w}{ }strategies\\PY{n+nt}{\\PYZlt{}/font\\PYZgt{}}\\PY{+w}{ }are\\PY{+w}{ }plans\\PY{+w}{ }that\\PY{+w}{ }guide\\PY{+w}{ }investors\\PY{+w}{ }to\\PY{+w}{ }choose\\PY{+w}{ }\\PY{n+nt}{\\PYZlt{}font}\\PY{+w}{ }\\PY{n+na}{color=}\\PY{l+s}{\\PYZdq{}green\\PYZdq{}}\\PY{+w}{ }\\PY{n+na}{style=}\\PY{l+s}{\\PYZdq{}color:green\\PYZdq{}}\\PY{n+nt}{\\PYZgt{}}the\\PY{+w}{ }best\\PY{+w}{ }investment\\PY{+w}{ }opportunities\\PY{n+nt}{\\PYZlt{}/font\\PYZgt{}}\\PY{+w}{ }that\\PY{+w}{ }align\\PY{+w}{ }with\\PY{+w}{ }their\\PY{+w}{ }financial\\PY{+w}{ }goals.\\PY{n+nt}{\\PYZlt{}/p\\PYZgt{}}\n", + "\\PY{+w}{ }\\PY{n+nt}{\\PYZlt{}img}\\PY{+w}{ }\\PY{n+na}{src=}\\PY{l+s}{\\PYZdq{}https://en.wikipedia.org/static/images/icons/wikipedia.png\\PYZdq{}}\\PY{+w}{ }\\PY{n+na}{width=}\\PY{l+s}{\\PYZdq{}20\\PYZdq{}}\\PY{+w}{ }\\PY{n+na}{height=}\\PY{l+s}{\\PYZdq{}20\\PYZdq{}}\\PY{n+nt}{\\PYZgt{}}\n", + "\\PY{n+nt}{\\PYZlt{}/div\\PYZgt{}}\n", + "\\end{Verbatim}\n" + ], + 
"text/plain": [ + "\n", + "Financial Overview\n", + "<b>Financial Overview</b>\n", + "<p>The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.</p>\n", + "<div>\n", + " <b>Strategies of Investment</b>\n", + " <p>Investment strategies</font> are plans that guide investors to choose <font color="green" style="color:green">the best investment opportunities</font> that align with their financial goals.</p>\n", + " <img src="https://en.wikipedia.org/static/images/icons/wikipedia.png" width="20" height="20">\n", + "</div>\n", + "The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.
\n", + "\n", + " Strategies of Investment\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "Financial Overview\n", + "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", + " \n", + "The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.
\n", + "\n", + " Strategies of Investment\n", + "\n" + ], + "text/plain": [ + "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", + " \n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display, HTML, Code\n", + "\n", + "html = \"\"\"\n", + "Financial Overview\n", + " The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.
\n", + "\n", + " Strategies of Investment\n", + "\n", + "\"\"\"\n", + "\n", + "display(Code(html))\n", + "display(HTML(html))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Utilizing BeautifulSoup for Parsing\n", + "Many SEC EDGAR filings are available in HTML document format. To ease the process of reading the documents, we will be using the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) (\"bs4\") library to parse an HTML document into a tree-like structure of HTML Tags (`bs4.Tag`)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Let's apply this to our example:***" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag 0: b (text: Financial ...)\n", + "Tag 1: p (text: The financ...)\n", + "Tag 2: div (text: Strategies...)\n" + ] + } + ], + "source": [ + "import bs4\n", + "\n", + "\n", + "# Utility function, ignore it\n", + "def get_children_tags(source) -> list[bs4.Tag]:\n", + " return [tag for tag in source.children if isinstance(tag, bs4.Tag)]\n", + "\n", + "\n", + "# Utility function, ignore it\n", + "def tag_to_string(tag):\n", + " text = tag.text.strip()\n", + " if len(text) > 0:\n", + " text = text[:10] + \"...\" if len(text) > 10 else text\n", + " return f\"{tag.name} (text: {text})\"\n", + " else:\n", + " return f\"{tag.name} (no text)\"\n", + "\n", + "\n", + "parse_result = bs4.BeautifulSoup(html, \"lxml\").html.body\n", + "bs4_tags = get_children_tags(parse_result)\n", + "for i, tag in enumerate(bs4_tags):\n", + " print(f\"Tag {i}: {tag_to_string(tag)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we `children` iterates only over the top-level tags. 
Children of children can be accessed by using `children` attribute again:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag 2 -> Tag 0: b (text: Strategies...))\n", + "Tag 2 -> Tag 1: p (text: Investment...))\n", + "Tag 2 -> Tag 2: img (no text))\n" + ] + } + ], + "source": [ + "for i, tag in enumerate(get_children_tags(bs4_tags[2])):\n", + " print(f\"Tag 2 -> Tag {i}: {tag_to_string(tag)})\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding the Role of HtmlTag\n", + "Instead of interacting directly with `bs4.Tag`, the SEC EDGAR HTML Parser uses `HtmlTag`, a wrapper around `bs4.Tag`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " The HtmlTag class is a wrapper for BeautifulSoup4 Tag objects.\n", + "\n", + " It serves three main purposes:\n", + "\n", + " 1. Decoupling: By abstracting the underlying BeautifulSoup4 library, we\n", + " can isolate our application logic from the library specifics. This\n", + " makes it easier to modify or even replace the HTML parsing library in\n", + " the future without extensive codebase changes.\n", + "\n", + " 2. Usability: The HtmlTag class provides a convenient location to add\n", + " extension methods or additional properties not offered by the native\n", + " BeautifulSoup4 Tag class. This enhances the usability of the class.\n", + "\n", + " 3. 
Caching: The HtmlTag class also caches processing results, improving\n", + " performance by avoiding unnecessary re-computation.\n", + " \n", + "\n", + " The HtmlTagParser parses an HTML document using BeautifulSoup4.\n", + " It then wraps the parsed bs4.Tag objects into HtmlTag objects.\n", + " \n" + ] + } + ], + "source": [ + "from sec_parser.processing_engine import HtmlTag, HtmlTagParser\n", + "\n", + "print(HtmlTag.__doc__)\n", + "print(HtmlTagParser.__doc__)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Let's apply this to our example:***" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", + " \n", + "\n", + "Strategies of Investment\n", + "" + ], + "text/plain": [ + "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bs4_div_tag = bs4_tags[2]\n", + "display(HTML(str(bs4_div_tag)))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By applying `HtmlTag` to the `bs4.Tag` object, we can now access the `HtmlTag` attributes and methods that are not available in `bs4`. For example, we can get a percentage of green text:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The fraction of text within this div that is colored green: 21%\n" + ] + } + ], + "source": [ + "div_tag = HtmlTag(bs4_div_tag)\n", + "percentage = div_tag.get_text_styles_metrics()[(\"color\", \"green\")]\n", + "print(f\"The fraction of text within this div that is colored green: {percentage:.0f}%\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's wrap the rest of the tags in our example with `HtmlTag`:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "tags = [HtmlTag(bs4_tag) for bs4_tag in bs4_tags]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining Semantic Elements" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " In the domain of HTML parsing, especially in the context of SEC EDGAR documents,\n", + " a semantic element refers to a meaningful unit within the document that serves a\n", + " specific purpose. For example, a paragraph or a table might be considered a\n", + " semantic element. 
Unlike syntactic elements, which merely exist to structure the\n", + " HTML, semantic elements carry information that is vital to the understanding of the\n", + " document's content.\n", + "\n", + " This class serves as a foundational representation of such semantic elements,\n", + " containing an HtmlTag object that stores the raw HTML tag information. Subclasses\n", + " will implement additional behaviors based on the type of the semantic element.\n", + " \n" + ] + } + ], + "source": [ + "from sec_parser.semantic_elements import AbstractSemanticElement\n", + "\n", + "print(AbstractSemanticElement.__doc__)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A few examples of Semantic Elements:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The TextElement class represents a standard text paragraph within a document.\n", + "The TableElement class represents a standard table within a document.\n", + "\n", + " The TitleElement class represents the title of a paragraph or other content object.\n", + " It serves as a semantic marker, providing context and structure to the document.\n", + " \n", + "\n", + " The TopSectionTitle class represents the title and the beginning of a top-level\n", + " section of a document. For instance, in SEC 10-Q reports, a\n", + " top-level section could be \"Part I, Item 3. Quantitative and Qualitative\n", + " Disclosures About Market Risk.\".\n", + " \n", + "\n", + " The NotYetClassifiedElement class represents an element whose type\n", + " has not yet been determined. 
The parsing process aims to\n", + " classify all instances of this class into more specific\n", + " subclasses of AbstractSemanticElement.\n", + " \n" + ] + } + ], + "source": [ + "from sec_parser.semantic_elements import (\n", + " TextElement,\n", + " TableElement,\n", + " TitleElement,\n", + " TopSectionTitle,\n", + " NotYetClassifiedElement,\n", + ")\n", + "\n", + "print(TextElement.__doc__)\n", + "print(TableElement.__doc__)\n", + "print(TitleElement.__doc__)\n", + "print(TopSectionTitle.__doc__)\n", + "print(NotYetClassifiedElement.__doc__)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To summarize, the purpose of parsing is to produce an ordered list of Semantic Elements from a tree of HTML Tags." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Let's apply this to our example:***" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At the beginning of parsing the example we would have the following Semantic Elements:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NotYetClassifiedElement (text: Financial ...)\n", + "NotYetClassifiedElement (text: The financ...)\n", + "NotYetClassifiedElement
(text: Strategies...)\n" + ] + } + ], + "source": [ + "# Utility function, ignore it\n", + "def show(elements):\n", + " for element in elements:\n", + " text = element.text[:10]\n", + " if hasattr(element, \"inner_elements\"):\n", + " print(f\"{element} (has {len(element.inner_elements)} elements inside)\")\n", + " elif text:\n", + " print(f\"{element} (text: {text}...)\")\n", + " else:\n", + " print(f\"{element}\")\n", + "\n", + "\n", + "initial_elements = [NotYetClassifiedElement(tag) for tag in tags]\n", + "show(initial_elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At the end of our parsing we expect to have the following Semantic Elements:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TitleElement[L0] (text: Financial ...)\n", + "TextElement(text: The financ...)\n", + "TitleElement[L0] (text: Strategies...)\n", + "TextElement
(text: Investment...)\n", + "ImageElement\n" + ] + } + ], + "source": [ + "from sec_parser import ImageElement\n", + "\n", + "expected_elements: list[AbstractSemanticElement] = [\n", + " TitleElement(tags[0]),\n", + " TextElement(tags[1]),\n", + " TitleElement(tags[2].get_children()[0]),\n", + " TextElement(tags[2].get_children()[1]),\n", + " ImageElement(tags[2].get_children()[2]),\n", + "]\n", + "show(expected_elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding the Parsing Process" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Responsible for parsing semantic elements from HTML documents.\n", + " It takes raw HTML and turns it into a list of objects\n", + " representing semantic elements.\n", + "\n", + " At a High Level:\n", + " ==================\n", + " 1. Extract top-level HTML tags from the document.\n", + " 2. Transform these tags into a list of more specific semantic\n", + " elements step-by-step.\n", + "\n", + " Why Focus on Top-Level Tags?\n", + " ============================\n", + " SEC filings usually have a flat HTML structure, which simplifies the\n", + " parsing process.Each top-level HTML tag often directly corresponds\n", + " to a single semantic element. This is different from many websites\n", + " where HTML tags are nested deeply,requiring more complex parsing.\n", + "\n", + " For Advanced Users:\n", + " ====================\n", + " The parsing process is implemented as a sequence of steps and allows for\n", + " customization at each step.\n", + "\n", + " - Pipeline Pattern: Raw HTML tags are processed in a sequential manner.\n", + " The steps follow an ordered, step-by-step approach, akin to a Finite\n", + " State Machine (FSM). 
Each element transitions through various states\n", + " defined by the sequence of processing steps.\n", + "\n", + " - Strategy Pattern: Each step is customizable. You can either replace,\n", + " remove, or extend any of the existing steps with your own or\n", + " inherited implementation. Alternatively, you can replace the entire pipeline\n", + " with your own process.\n", + " \n" + ] + } + ], + "source": [ + "from sec_parser.processing_engine import AbstractSemanticElementParser\n", + "\n", + "print(AbstractSemanticElementParser.__doc__)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Let's apply this to our example:***" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Processing is organized in steps. If there are no steps, there will be no processing:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NotYetClassifiedElement (text: Financial ...)\n", + "NotYetClassifiedElement
(text: The financ...)\n", + "NotYetClassifiedElement
(text: Strategies...)\n" + ] + } + ], + "source": [ + "from sec_parser import Edgar10QParser\n", + "\n", + "\n", + "def get_steps():\n", + " return []\n", + "\n", + "\n", + "parser = Edgar10QParser(get_steps)\n", + "elements = parser.parse(html)\n", + "show(elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, it is exactly the same as just wrapping the tags with `NotYetClassifiedElement`:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NotYetClassifiedElement (text: Financial ...)\n", + "NotYetClassifiedElement(text: The financ...)\n", + "NotYetClassifiedElement
(text: Strategies...)\n" + ] + } + ], + "source": [ + "show([NotYetClassifiedElement(tag) for tag in tags])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create the first simple parsing step that naively identifies title and text tags." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MyClassifier: Successfully processed 3 tags!\n", + "\n" + ] + } + ], + "source": [ + "from sec_parser.processing_steps import AbstractProcessingStep\n", + "\n", + "\n", + "class MyClassifier(AbstractProcessingStep):\n", + " def __init__(self):\n", + " super().__init__()\n", + " # You can hold state in your processing steps\n", + " self.processed_tags_count = 0\n", + "\n", + " # This method must be implemented when inheriting from AbstractProcessingStep\n", + " def _process(self, elements):\n", + " parsed = []\n", + " for e in elements:\n", + " self.processed_tags_count += 1\n", + " if e.html_tag.name == \"b\":\n", + " parsed.append(TitleElement.create_from_element(e, \"\"))\n", + " elif e.html_tag.name == \"p\":\n", + " parsed.append(TextElement.create_from_element(e, \"\"))\n", + " else:\n", + " parsed.append(e)\n", + " print(\n", + " f\"MyClassifier: Successfully processed {self.processed_tags_count} tags!\\n\"\n", + " )\n", + " return parsed\n", + "\n", + "\n", + "def get_steps() -> list[AbstractProcessingStep]:\n", + " return [MyClassifier()]\n", + "\n", + "\n", + "parser = Edgar10QParser(get_steps)\n", + "elements = parser.parse(html)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TitleElement[L0] (text: Financial ...)\n", + "TextElement(text: The financ...)\n", + "NotYetClassifiedElement
(text: Strategies...)\n" + ] + } + ], + "source": [ + "show(elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The third tag cannot be identified as a single Semantic Element; let's see what we can do about it." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Handling Multiple Semantic Elements in a Single HTML Tag" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If multiple Semantic Elements are in the same HTML tag, we would first identify such cases by naming the element as `CompositeSemanticElement`." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " CompositeSemanticElement acts as a container for other semantic elements,\n", + " especially for cases where a single HTML root tag wraps multiple elements.\n", + " This ensures structural integrity and enables various features like\n", + " semantic segmentation visualization, and debugging by comparison with the\n", + " original document.\n", + "\n", + " Why is this useful:\n", + " ===================\n", + " 1. Some semantic elements, like XBRL tags (), may wrap multiple semantic\n", + " elements. The container ensures that these relationships are not broken\n", + " during parsing.\n", + " 2. Enables the parser to fully reconstruct the original HTML document, which\n", + " opens up possibilities for features like semantic segmentation visualization\n", + " (e.g. 
recreate the original document but put semi-transparent colored boxes\n", + " on top, based on semantic meaning), serialization of parsed documents into\n", + " an augmented HTML, and debugging by comparing to the original document.\n", + " \n" + ] + } + ], + "source": [ + "from sec_parser.semantic_elements import CompositeSemanticElement\n", + "\n", + "print(CompositeSemanticElement.__doc__)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Let's apply this to our example:***" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "by creating a naive implementation of doing the identification:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NotYetClassifiedElement (text: Financial ...)\n", + "NotYetClassifiedElement (text: The financ...)\n", + "CompositeSemanticElement
(has 3 elements inside)\n" + ] + } + ], + "source": [ + "class CompositeElementIdentificationStep(AbstractProcessingStep):\n", + " def _process(self, elements):\n", + " result = []\n", + " for e in elements:\n", + " if e.html_tag.name == \"div\":\n", + " result.append(\n", + " CompositeSemanticElement.create_from_element(\n", + " e,\n", + " inner_elements=[\n", + " NotYetClassifiedElement(t)\n", + " for t in e.html_tag.get_children()\n", + " ],\n", + " log_origin=\"CompositeElementIdentificationStep\",\n", + " )\n", + " )\n", + " else:\n", + " result.append(e)\n", + " return result\n", + "\n", + "\n", + "parser = Edgar10QParser(lambda: [CompositeElementIdentificationStep()])\n", + "elements = parser.parse(html, unwrap_elements=False)\n", + "show(elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have successfully identified the tag as a `CompositeSemanticElement`.\n", + "\n", + "However, `CompositeSemanticElement` is intended for more advanced use cases, normally we won't even notice it (we had to set `unwrap_elements` flag to `False` to see it):" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NotYetClassifiedElement (text: Financial ...)\n", + "NotYetClassifiedElement(text: The financ...)\n", + "NotYetClassifiedElement (text: Strategies...)\n", + "NotYetClassifiedElement
(text: Investment...)\n", + "NotYetClassifiedElement\n" + ] + } + ], + "source": [ + "elements = parser.parse(html)\n", + "show(elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now combine the steps together. One step's output is another step's input, therefore order is important:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MyClassifier: Successfully processed 3 tags!\n", + "\n", + "TitleElement[L0] (text: Financial ...)\n", + "TextElement
(text: The financ...)\n", + "NotYetClassifiedElement (text: Strategies...)\n", + "NotYetClassifiedElement
(text: Investment...)\n", + "NotYetClassifiedElement\n" + ] + } + ], + "source": [ + "def get_steps():\n", + " return [\n", + " CompositeElementIdentificationStep(),\n", + " MyClassifier(),\n", + " ]\n", + "\n", + "\n", + "parser = Edgar10QParser(get_steps)\n", + "elements = parser.parse(html)\n", + "show(elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the inner elements of `CompositeSemanticElement` did not get processed. This is because it requires special handling. A simple way to do it would be to inherit from `AbstractElementwiseProcessingStep`:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "from sec_parser.processing_steps import AbstractElementwiseProcessingStep\n", + "\n", + "\n", + "class BetterClassifier(AbstractElementwiseProcessingStep):\n", + " def _process_element(self, element, context):\n", + " if element.html_tag.name == \"b\":\n", + " return TitleElement.create_from_element(element, \"\")\n", + " elif element.html_tag.name == \"p\":\n", + " return TextElement.create_from_element(element, \"\")\n", + " elif element.html_tag.name == \"img\":\n", + " return ImageElement.create_from_element(element, \"\")\n", + " return element" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TitleElement[L0] (text: Financial ...)\n", + "TextElement
(text: The financ...)\n", + "TitleElement[L0] (text: Strategies...)\n", + "TextElement
(text: Investment...)\n", + "ImageElement\n" + ] + } + ], + "source": [ + "def get_steps():\n", + " return [\n", + " CompositeElementIdentificationStep(),\n", + " BetterClassifier(),\n", + " ]\n", + "\n", + "\n", + "parser = Edgar10QParser(get_steps)\n", + "elements = parser.parse(html)\n", + "show(elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have completed the HTML parsing as the result looks the same as we intended:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TitleElement[L0] (text: Financial ...)\n", + "TextElement
(text: The financ...)\n", + "TitleElement[L0] (text: Strategies...)\n", + "TextElement
(text: Investment...)\n", + "ImageElement\n" + ] + } + ], + "source": [ + "show(expected_elements)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction to Semantic Trees" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Builds a semantic tree from a list of semantic elements.\n", + "\n", + " Why Use a Tree Structure?\n", + " =========================\n", + " Using a tree data structure allows for easier and more robust filtering of sections.\n", + " With a tree, you can select specific branches to filter, making it straightforward\n", + " to identify section boundaries. This approach is more maintainable and robust\n", + " compared to attempting the same operations on a flat list of elements.\n", + "\n", + " Overview:\n", + " =========\n", + " 1. Takes a list of semantic elements.\n", + " 2. Applies nesting rules to these elements.\n", + "\n", + " Customization:\n", + " ==============\n", + " The nesting process is customizable through a list of rules. 
These rules determine\n", + " how new elements should be nested under existing ones.\n", + "\n", + " Advanced Customization:\n", + " =======================\n", + " You can supply your own set of rules by providing a callable to `get_rules`, which\n", + " should return a list of `AbstractNestingRule` instances.\n", + " \n" + ] + } + ], + "source": [ + "from sec_parser.semantic_tree import TreeBuilder\n", + "\n", + "print(TreeBuilder.__doc__)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Let's apply this to our example:***" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A very similar processing pattern is used here as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;34mTitleElement\u001b[0m: Financial Overview\n", + "└── \u001b[1;34mTextElement\u001b[0m: The financial sector is a categ...ommercial and retail customers.\n", + "\u001b[1;34mTitleElement\u001b[0m: Strategies of Investment\n", + "├── \u001b[1;34mTextElement\u001b[0m: Investment strategies are plans...ign with their financial goals.\n", + "└── \u001b[1;34mImageElement\u001b[0m\n" + ] + } + ], + "source": [ + "from sec_parser.semantic_tree import AlwaysNestAsParentRule, AbstractNestingRule, render\n", + "\n", + "\n", + "def get_rules() -> list[AbstractNestingRule]:\n", + " return [\n", + " AlwaysNestAsParentRule(TitleElement),\n", + " ]\n", + "\n", + "\n", + "builder = TreeBuilder(get_rules)\n", + "tree = builder.build(elements)\n", + "print(render(list(tree)))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;34mTitleElement\u001b[0m: Financial Overview\n", + "└── \u001b[1;34mTextElement\u001b[0m: The financial sector is a categ...ommercial and retail 
customers.\n" + ] + } + ], + "source": [ + "print(render(list(tree)[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;34mTitleElement\u001b[0m: Strategies of Investment\n", + "├── \u001b[1;34mTextElement\u001b[0m: Investment strategies are plans...ign with their financial goals.\n", + "└── \u001b[1;34mImageElement\u001b[0m\n" + ] + } + ], + "source": [ + "print(render(list(tree)[1]))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For further understanding of `sec-parser`, refer to the [**Documentation**](https://sec-parser.rtfd.io). If you're interested in contributing, consider checking out our [**Contribution Guide**](https://github.com/alphanome-ai/sec-parser/blob/main/CONTRIBUTING.md)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "orig_nbformat": 4 }, - { - "data": { - "text/html": [ - "\n", - "Financial Overview\n", - "
The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.
\n", - "\n", - " Strategies of Investment\n", - "\n" - ], - "text/plain": [ - "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", - " \n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import display, HTML, Code\n", - "\n", - "html = \"\"\"\n", - "Financial Overview\n", - " The financial sector is a category of the economy made up of firms that provide financial services to commercial and retail customers.
\n", - "\n", - " Strategies of Investment\n", - "\n", - "\"\"\"\n", - "\n", - "display(Code(html))\n", - "display(HTML(html))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utilizing BeautifulSoup for Parsing\n", - "Many SEC EDGAR filings are available in HTML document format. To ease the process of reading the documents, we will be using the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) (\"bs4\") library to parse an HTML document into a tree-like structure of HTML Tags (`bs4.Tag`)." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Let's apply this to our example:***" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tag 0: b (text: Financial ...)\n", - "Tag 1: p (text: The financ...)\n", - "Tag 2: div (text: Strategies...)\n" - ] - } - ], - "source": [ - "import bs4\n", - "\n", - "\n", - "# Utility function, ignore it\n", - "def get_children_tags(source) -> list[bs4.Tag]:\n", - " return [tag for tag in source.children if isinstance(tag, bs4.Tag)]\n", - "\n", - "\n", - "# Utility function, ignore it\n", - "def tag_to_string(tag):\n", - " text = tag.text.strip()\n", - " if len(text) > 0:\n", - " text = text[:10] + \"...\" if len(text) > 10 else text\n", - " return f\"{tag.name} (text: {text})\"\n", - " else:\n", - " return f\"{tag.name} (no text)\"\n", - "\n", - "\n", - "parse_result = bs4.BeautifulSoup(html, \"lxml\").html.body\n", - "bs4_tags = get_children_tags(parse_result)\n", - "for i, tag in enumerate(bs4_tags):\n", - " print(f\"Tag {i}: {tag_to_string(tag)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that we `children` iterates only over the top-level tags. 
Children of children can be accessed by using `children` attribute again:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tag 2 -> Tag 0: b (text: Strategies...))\n", - "Tag 2 -> Tag 1: p (text: Investment...))\n", - "Tag 2 -> Tag 2: img (no text))\n" - ] - } - ], - "source": [ - "for i, tag in enumerate(get_children_tags(bs4_tags[2])):\n", - " print(f\"Tag 2 -> Tag {i}: {tag_to_string(tag)})\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding the Role of HtmlTag\n", - "Instead of interacting directly with `bs4.Tag`, the SEC EDGAR HTML Parser uses `HtmlTag`, a wrapper around `bs4.Tag`." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " The HtmlTag class is a wrapper for BeautifulSoup4 Tag objects.\n", - "\n", - " It serves three main purposes:\n", - "\n", - " 1. Decoupling: By abstracting the underlying BeautifulSoup4 library, we\n", - " can isolate our application logic from the library specifics. This\n", - " makes it easier to modify or even replace the HTML parsing library in\n", - " the future without extensive codebase changes.\n", - "\n", - " 2. Usability: The HtmlTag class provides a convenient location to add\n", - " extension methods or additional properties not offered by the native\n", - " BeautifulSoup4 Tag class. This enhances the usability of the class.\n", - "\n", - " 3. 
Caching: The HtmlTag class also caches processing results, improving\n", - " performance by avoiding unnecessary re-computation.\n", - " \n", - "\n", - " The HtmlTagParser parses an HTML document using BeautifulSoup4.\n", - " It then wraps the parsed bs4.Tag objects into HtmlTag objects.\n", - " \n" - ] - } - ], - "source": [ - "from sec_parser.processing_engine import HtmlTag, HtmlTagParser\n", - "\n", - "print(HtmlTag.__doc__)\n", - "print(HtmlTagParser.__doc__)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Let's apply this to our example:***" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", - " \n", - "\n", - "Strategies of Investment\n", - "" - ], - "text/plain": [ - "Investment strategies are plans that guide investors to choose the best investment opportunities that align with their financial goals.
\n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "bs4_div_tag = bs4_tags[2]\n", - "display(HTML(str(bs4_div_tag)))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By applying `HtmlTag` to the `bs4.Tag` object, we can now access the `HtmlTag` attributes and methods that are not available in `bs4`. For example, we can get a percentage of green text:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The fraction of text within this div that is colored green: 21%\n" - ] - } - ], - "source": [ - "div_tag = HtmlTag(bs4_div_tag)\n", - "percentage = div_tag.get_text_styles_metrics()[(\"color\", \"green\")]\n", - "print(f\"The fraction of text within this div that is colored green: {percentage:.0f}%\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's wrap the rest of the tags in our example with `HtmlTag`:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "tags = [HtmlTag(bs4_tag) for bs4_tag in bs4_tags]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Defining Semantic Elements" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " In the domain of HTML parsing, especially in the context of SEC EDGAR documents,\n", - " a semantic element refers to a meaningful unit within the document that serves a\n", - " specific purpose. For example, a paragraph or a table might be considered a\n", - " semantic element. 
Unlike syntactic elements, which merely exist to structure the\n", - " HTML, semantic elements carry information that is vital to the understanding of the\n", - " document's content.\n", - "\n", - " This class serves as a foundational representation of such semantic elements,\n", - " containing an HtmlTag object that stores the raw HTML tag information. Subclasses\n", - " will implement additional behaviors based on the type of the semantic element.\n", - " \n" - ] - } - ], - "source": [ - "from sec_parser.semantic_elements import AbstractSemanticElement\n", - "\n", - "print(AbstractSemanticElement.__doc__)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A few examples of Semantic Elements:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The TextElement class represents a standard text paragraph within a document.\n", - "The TableElement class represents a standard table within a document.\n", - "\n", - " The TitleElement class represents the title of a paragraph or other content object.\n", - " It serves as a semantic marker, providing context and structure to the document.\n", - " \n", - "\n", - " The TopLevelSectionTitle class represents the title and the beginning of a top-level\n", - " section of a document. For instance, in SEC 10-Q reports, a\n", - " top-level section could be \"Part I, Item 3. Quantitative and Qualitative\n", - " Disclosures About Market Risk.\".\n", - " \n", - "\n", - " The NotYetClassifiedElement class represents an element whose type\n", - " has not yet been determined. 
The parsing process aims to\n", - " classify all instances of this class into more specific\n", - " subclasses of AbstractSemanticElement.\n", - " \n" - ] - } - ], - "source": [ - "from sec_parser.semantic_elements import (\n", - " TextElement,\n", - " TableElement,\n", - " TitleElement,\n", - " TopLevelSectionTitle,\n", - " NotYetClassifiedElement,\n", - ")\n", - "\n", - "print(TextElement.__doc__)\n", - "print(TableElement.__doc__)\n", - "print(TitleElement.__doc__)\n", - "print(TopLevelSectionTitle.__doc__)\n", - "print(NotYetClassifiedElement.__doc__)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To summarize, the purpose of parsing is to produce an ordered list of Semantic Elements from a tree of HTML Tags." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Let's apply this to our example:***" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At the beginning of parsing the example we would have the following Semantic Elements:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NotYetClassifiedElement (text: Financial ...)\n", - "NotYetClassifiedElement (text: The financ...)\n", - "NotYetClassifiedElement
(text: Strategies...)\n" - ] - } - ], - "source": [ - "# Utility function, ignore it\n", - "def show(elements):\n", - " for element in elements:\n", - " text = element.text[:10]\n", - " if hasattr(element, \"inner_elements\"):\n", - " print(f\"{element} (has {len(element.inner_elements)} elements inside)\")\n", - " elif text:\n", - " print(f\"{element} (text: {text}...)\")\n", - " else:\n", - " print(f\"{element}\")\n", - "\n", - "\n", - "initial_elements = [NotYetClassifiedElement(tag) for tag in tags]\n", - "show(initial_elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At the end of our parsing we expect to have the following Semantic Elements:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TitleElement[L0] (text: Financial ...)\n", - "TextElement(text: The financ...)\n", - "TitleElement[L0] (text: Strategies...)\n", - "TextElement
(text: Investment...)\n", - "ImageElement\n" - ] - } - ], - "source": [ - "from sec_parser import ImageElement\n", - "\n", - "expected_elements: list[AbstractSemanticElement] = [\n", - " TitleElement(tags[0]),\n", - " TextElement(tags[1]),\n", - " TitleElement(tags[2].get_children()[0]),\n", - " TextElement(tags[2].get_children()[1]),\n", - " ImageElement(tags[2].get_children()[2]),\n", - "]\n", - "show(expected_elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Understanding the Parsing Process" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " Responsible for parsing semantic elements from HTML documents.\n", - " It takes raw HTML and turns it into a list of objects\n", - " representing semantic elements.\n", - "\n", - " At a High Level:\n", - " ==================\n", - " 1. Extract top-level HTML tags from the document.\n", - " 2. Transform these tags into a list of more specific semantic\n", - " elements step-by-step.\n", - "\n", - " Why Focus on Top-Level Tags?\n", - " ============================\n", - " SEC filings usually have a flat HTML structure, which simplifies the\n", - " parsing process.Each top-level HTML tag often directly corresponds\n", - " to a single semantic element. This is different from many websites\n", - " where HTML tags are nested deeply,requiring more complex parsing.\n", - "\n", - " For Advanced Users:\n", - " ====================\n", - " The parsing process is implemented as a sequence of steps and allows for\n", - " customization at each step.\n", - "\n", - " - Pipeline Pattern: Raw HTML tags are processed in a sequential manner.\n", - " The steps follow an ordered, step-by-step approach, akin to a Finite\n", - " State Machine (FSM). 
Each element transitions through various states\n", - " defined by the sequence of processing steps.\n", - "\n", - " - Strategy Pattern: Each step is customizable. You can either replace,\n", - " remove, or extend any of the existing steps with your own or\n", - " inherited implementation. Alternatively, you can replace the entire pipeline\n", - " with your own process.\n", - " \n" - ] - } - ], - "source": [ - "from sec_parser.processing_engine import AbstractSemanticElementParser\n", - "\n", - "print(AbstractSemanticElementParser.__doc__)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Let's apply this to our example:***" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Processing is organized in steps. If there are no steps, there will be no processing:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NotYetClassifiedElement (text: Financial ...)\n", - "NotYetClassifiedElement
(text: The financ...)\n", - "NotYetClassifiedElement
(text: Strategies...)\n" - ] - } - ], - "source": [ - "from sec_parser import Edgar10QParser\n", - "\n", - "\n", - "def get_steps():\n", - " return []\n", - "\n", - "\n", - "parser = Edgar10QParser(get_steps)\n", - "elements = parser.parse(html)\n", - "show(elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, it is exactly the same as just wrapping the tags with `UndeterminedElement`:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NotYetClassifiedElement (text: Financial ...)\n", - "NotYetClassifiedElement(text: The financ...)\n", - "NotYetClassifiedElement
(text: Strategies...)\n" - ] - } - ], - "source": [ - "show([NotYetClassifiedElement(tag) for tag in tags])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's create the first simple parsing step that naively identifies title and text tags." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MyClassifier: Successfully processed 3 tags!\n", - "\n" - ] - } - ], - "source": [ - "from sec_parser.processing_steps import AbstractProcessingStep\n", - "\n", - "\n", - "class MyClassifier(AbstractProcessingStep):\n", - " def __init__(self):\n", - " super().__init__()\n", - " # You can hold state in your processing steps\n", - " self.processed_tags_count = 0\n", - "\n", - " # This method must be implemented when inheriting from AbstractProcessingStep\n", - " def _process(self, elements):\n", - " parsed = []\n", - " for e in elements:\n", - " self.processed_tags_count += 1\n", - " if e.html_tag.name == \"b\":\n", - " parsed.append(TitleElement.create_from_element(e, \"\"))\n", - " elif e.html_tag.name == \"p\":\n", - " parsed.append(TextElement.create_from_element(e, \"\"))\n", - " else:\n", - " parsed.append(e)\n", - " print(\n", - " f\"MyClassifier: Successfully processed {self.processed_tags_count} tags!\\n\"\n", - " )\n", - " return parsed\n", - "\n", - "\n", - "def get_steps() -> list[AbstractProcessingStep]:\n", - " return [MyClassifier()]\n", - "\n", - "\n", - "parser = Edgar10QParser(get_steps)\n", - "elements = parser.parse(html)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TitleElement[L0] (text: Financial ...)\n", - "TextElement(text: The financ...)\n", - "NotYetClassifiedElement
(text: Strategies...)\n" - ] - } - ], - "source": [ - "show(elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The third tag cannot be identified as a single Semantic Element, let's see what can we do about it." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Handling Multiple Semantic Elements in a Single HTML Tag" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If multiple Semantic Elements are in the same HTML tag, we would first identify such cases by naming the element as `CompositeSemanticElement`." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " CompositeSemanticElement acts as a container for other semantic elements,\n", - " especially for cases where a single HTML root tag wraps multiple elements.\n", - " This ensures structural integrity and enables various features like\n", - " semantic segmentation visualization, and debugging by comparison with the\n", - " original document.\n", - "\n", - " Why is this useful:\n", - " ===================\n", - " 1. Some semantic elements, like XBRL tags (), may wrap multiple semantic\n", - " elements. The container ensures that these relationships are not broken\n", - " during parsing.\n", - " 2. Enables the parser to fully reconstruct the original HTML document, which\n", - " opens up possibilities for features like semantic segmentation visualization\n", - " (e.g. 
recreate the original document but put semi-transparent colored boxes\n", - " on top, based on semantic meaning), serialization of parsed documents into\n", - " an augmented HTML, and debugging by comparing to the original document.\n", - " \n" - ] - } - ], - "source": [ - "from sec_parser.semantic_elements import CompositeSemanticElement\n", - "\n", - "print(CompositeSemanticElement.__doc__)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Let's apply this to our example:***" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "by creating a naive implementation of doing the identification:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NotYetClassifiedElement (text: Financial ...)\n", - "NotYetClassifiedElement (text: The financ...)\n", - "CompositeSemanticElement
(has 3 elements inside)\n" - ] - } - ], - "source": [ - "class CompositeElementIdentificationStep(AbstractProcessingStep):\n", - " def _process(self, elements):\n", - " result = []\n", - " for e in elements:\n", - " if e.html_tag.name == \"div\":\n", - " result.append(\n", - " CompositeSemanticElement.create_from_element(\n", - " e,\n", - " inner_elements=[\n", - " NotYetClassifiedElement(t)\n", - " for t in e.html_tag.get_children()\n", - " ],\n", - " log_origin=\"CompositeElementIdentificationStep\",\n", - " )\n", - " )\n", - " else:\n", - " result.append(e)\n", - " return result\n", - "\n", - "\n", - "parser = Edgar10QParser(lambda: [CompositeElementIdentificationStep()])\n", - "elements = parser.parse(html, unwrap_elements=False)\n", - "show(elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have successfully identified the tag as a `CompositeSemanticElement`.\n", - "\n", - "However, `CompositeSemanticElement` is intended for more advanced use cases, normally we won't even notice it (we had to set `unwrap_elements` flag to `False` to see it):" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NotYetClassifiedElement (text: Financial ...)\n", - "NotYetClassifiedElement(text: The financ...)\n", - "NotYetClassifiedElement (text: Strategies...)\n", - "NotYetClassifiedElement
(text: Investment...)\n", - "NotYetClassifiedElement\n" - ] - } - ], - "source": [ - "elements = parser.parse(html)\n", - "show(elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now combine the steps together. One steps output is another steps input, therefore order is important:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MyClassifier: Successfully processed 3 tags!\n", - "\n", - "TitleElement[L0] (text: Financial ...)\n", - "TextElement
(text: The financ...)\n", - "NotYetClassifiedElement (text: Strategies...)\n", - "NotYetClassifiedElement
(text: Investment...)\n", - "NotYetClassifiedElement\n" - ] - } - ], - "source": [ - "def get_steps():\n", - " return [\n", - " CompositeElementIdentificationStep(),\n", - " MyClassifier(),\n", - " ]\n", - "\n", - "\n", - "parser = Edgar10QParser(get_steps)\n", - "elements = parser.parse(html)\n", - "show(elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that the inner elements of `CompositeSemanticElement` did not get processed. This is because it requires special handling. A simple way to do it would be to inherit from `ElementwiseProcessingStep`:" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "from sec_parser.processing_steps import AbstractElementwiseProcessingStep\n", - "\n", - "\n", - "class BetterClassifier(AbstractElementwiseProcessingStep):\n", - " def _process_element(self, element, context):\n", - " if element.html_tag.name == \"b\":\n", - " return TitleElement.create_from_element(element, \"\")\n", - " elif element.html_tag.name == \"p\":\n", - " return TextElement.create_from_element(element, \"\")\n", - " elif element.html_tag.name == \"img\":\n", - " return ImageElement.create_from_element(element, \"\")\n", - " return element" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TitleElement[L0] (text: Financial ...)\n", - "TextElement
(text: The financ...)\n", - "TitleElement[L0] (text: Strategies...)\n", - "TextElement
(text: Investment...)\n", - "ImageElement\n" - ] - } - ], - "source": [ - "def get_steps():\n", - " return [\n", - " CompositeElementIdentificationStep(),\n", - " BetterClassifier(),\n", - " ]\n", - "\n", - "\n", - "parser = Edgar10QParser(get_steps)\n", - "elements = parser.parse(html)\n", - "show(elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have completed the HTML parsing as the result looks the same as we intended:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TitleElement[L0] (text: Financial ...)\n", - "TextElement
(text: The financ...)\n", - "TitleElement[L0] (text: Strategies...)\n", - "TextElement
(text: Investment...)\n", - "ImageElement\n" - ] - } - ], - "source": [ - "show(expected_elements)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction to Semantic Trees" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " Builds a semantic tree from a list of semantic elements.\n", - "\n", - " Why Use a Tree Structure?\n", - " =========================\n", - " Using a tree data structure allows for easier and more robust filtering of sections.\n", - " With a tree, you can select specific branches to filter, making it straightforward\n", - " to identify section boundaries. This approach is more maintainable and robust\n", - " compared to attempting the same operations on a flat list of elements.\n", - "\n", - " Overview:\n", - " =========\n", - " 1. Takes a list of semantic elements.\n", - " 2. Applies nesting rules to these elements.\n", - "\n", - " Customization:\n", - " ==============\n", - " The nesting process is customizable through a list of rules. 
These rules determine\n", - " how new elements should be nested under existing ones.\n", - "\n", - " Advanced Customization:\n", - " =======================\n", - " You can supply your own set of rules by providing a callable to `get_rules`, which\n", - " should return a list of `AbstractNestingRule` instances.\n", - " \n" - ] - } - ], - "source": [ - "from sec_parser.semantic_tree import TreeBuilder\n", - "\n", - "print(TreeBuilder.__doc__)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***Let's apply this to our example:***" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A very similar processing pattern is used here as well:" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;34mTitleElement\u001b[0m: Financial Overview\n", - "└── \u001b[1;34mTextElement\u001b[0m: The financial sector is a categ...ommercial and retail customers.\n", - "\u001b[1;34mTitleElement\u001b[0m: Strategies of Investment\n", - "├── \u001b[1;34mTextElement\u001b[0m: Investment strategies are plans...ign with their financial goals.\n", - "└── \u001b[1;34mImageElement\u001b[0m\n" - ] - } - ], - "source": [ - "from sec_parser.semantic_tree import AlwaysNestAsParentRule, AbstractNestingRule, render\n", - "\n", - "\n", - "def get_rules() -> list[AbstractNestingRule]:\n", - " return [\n", - " AlwaysNestAsParentRule(TitleElement),\n", - " ]\n", - "\n", - "\n", - "builder = TreeBuilder(get_rules)\n", - "tree = builder.build(elements)\n", - "print(render(list(tree)))" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;34mTitleElement\u001b[0m: Financial Overview\n", - "└── \u001b[1;34mTextElement\u001b[0m: The financial sector is a categ...ommercial and retail 
customers.\n" - ] - } - ], - "source": [ - "print(render(list(tree)[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;34mTitleElement\u001b[0m: Strategies of Investment\n", - "├── \u001b[1;34mTextElement\u001b[0m: Investment strategies are plans...ign with their financial goals.\n", - "└── \u001b[1;34mImageElement\u001b[0m\n" - ] - } - ], - "source": [ - "print(render(list(tree)[1]))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For further understanding of `sec-parser`, refer to the [**Documentation**](https://sec-parser.rtfd.io). If you're interested in contributing, consider checking out our [**Contribution Guide**](https://github.com/alphanome-ai/sec-parser/blob/main/CONTRIBUTING.md)." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/source/notebooks/user_guide.ipynb b/docs/source/notebooks/user_guide.ipynb index 94bb3e2..83e9bc3 100644 --- a/docs/source/notebooks/user_guide.ipynb +++ b/docs/source/notebooks/user_guide.ipynb @@ -1,470 +1,470 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# This will auto-format your code. You can optionally install 'jupyter-black' using pip.\n", - "# Note: this cell is hidden from the HTML output. Read more: https://nbsphinx.readthedocs.io/en/0.2.1/hidden-cells.html\n", - "try:\n", - " import jupyter_black\n", - "\n", - " jupyter_black.load()\n", - "except ImportError:\n", - " pass" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# User Guide: Quick Start" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Welcome to the User Guide for `sec-parser`! This guide is designed to walk you through the fundamental steps needed to install and use the library for parsing SEC EDGAR HTML documents into semantic elements and trees. Whether you're a financial analyst, a data scientist, or someone interested in SEC filings, this guide provides examples and code snippets to help you get started.\n", - "\n", - "This guide is interactive, allowing you to engage with the code and concepts as you learn. 
You can run and modify all the code examples shown here for yourself by cloning the repository and running the [user_guide.ipynb](https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb) in a Jupyter notebook.\n", - "\n", - "Alternatively, you can also run the notebook directly in your browser using Google Colab:\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb)\n", - "[![My Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/alphanome-ai/sec-parser/main?filepath=docs/source/notebooks/user_guide.ipynb)\n", - "[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/kernels/welcome?src=https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb)\n", - "[![Open in SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb)\n", - "\n", - "Let's get started!" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Getting Started" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This guide will walk you through the process of installing the `sec-parser` package and using it to extract the \"Segment Operating Performance\" section as a semantic tree from the latest Apple 10-Q filing." 
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Installation" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, install the `sec-parser` package using pip:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " import sec_parser\n", - "except ImportError:\n", - " !pip install -q sec-parser\n", - " import sec_parser" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to run the example code in this Guide, you'll also need the `sec_downloader` package:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "try:\n", - " import sec_downloader\n", - "except ImportError:\n", - " !pip install -q sec-downloader\n", - " import sec_downloader" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Usage" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you've installed the necessary packages, you can start by downloading the filing from the SEC EDGAR website. Here's how you can do it:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from sec_downloader import Downloader\n", - "\n", - "# Initialize the downloader with your company name and email\n", - "dl = Downloader(\"MyCompanyName\", \"email@example.com\")\n", - "\n", - "# Download the latest 10-Q filing for Apple\n", - "html = dl.get_filing_html(ticker=\"AAPL\", form=\"10-Q\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> [!NOTE]\n", - "The company name and email address are used to form a user-agent string that adheres to the SEC EDGAR's fair access policy for programmatic downloading. 
[Source](https://www.sec.gov/os/webmaster-faq#code-support)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we can parse the filing HTML into a list of semantic elements:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Utility function to make the example code a bit more compact\n", - "def print_first_n_lines(text: str, *, n: int):\n", - " print(\"\\n\".join(text.split(\"\\n\")[:n]), \"...\", sep=\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;34mTopLevelSectionTitle\u001b[0m: PART I — FINANCIAL INFORMATION\n", - "\u001b[1;34mTopLevelSectionTitle\u001b[0m: Item 1. Financial Statements\n", - "\u001b[1;34mTitleElement\u001b[0m: Apple Inc.\n", - "\u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n", - "\u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...housands and per share amounts)\n", - "\u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n", - "\u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n", - "...\n" - ] - } - ], - "source": [ - "import sec_parser as sp\n", - "\n", - "elements: list = sp.Edgar10QParser().parse(html)\n", - "\n", - "demo_output: str = sp.render(elements)\n", - "print_first_n_lines(demo_output, n=7)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also construct a semantic tree to allow for easy filtering by parent sections:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;34mTopLevelSectionTitle\u001b[0m: PART I — FINANCIAL INFORMATION\n", - "├── 
\u001b[1;34mTopLevelSectionTitle\u001b[0m: Item 1. Financial Statements\n", - "│ ├── \u001b[1;34mTitleElement\u001b[0m: Apple Inc.\n", - "│ │ └── \u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n", - "│ │ ├── \u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...housands and per share amounts)\n", - "│ │ ├── \u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n", - "│ │ ├── \u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n", - "...\n" - ] - } - ], - "source": [ - "tree = sp.TreeBuilder().build(elements)\n", - "\n", - "demo_output: str = sp.render(tree)\n", - "print_first_n_lines(demo_output, n=7)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Advanced Usage" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Processing is organized in steps. You can modify, add, remove steps as needed. Each step is a function that takes a list of elements as input and returns a list of elements as output. The output of one step is the input of the next step." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Step 1: IndividualSemanticElementExtractor\n", - "Step 2: ImageClassifier\n", - "Step 3: EmptyElementClassifier\n", - "Step 4: TableClassifier\n", - "Step 5: TableOfContentsClassifier\n", - "Step 6: TopLevelSectionManagerFor10Q\n", - "Step 7: IntroductorySectionElementClassifier\n", - "Step 8: TextClassifier\n", - "Step 9: HighlightedTextClassifier\n", - "Step 10: SupplementaryTextClassifier\n", - "Step 11: TitleClassifier\n", - "Step 12: TextElementMerger\n" - ] - } - ], - "source": [ - "steps = sp.Edgar10QParser().get_default_steps()\n", - "\n", - "for i, step in enumerate(steps, 1):\n", - " print(f\"Step {i}: {step.__class__.__name__}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's illustrate an example where we replace the text element classifier with our custom classifier. 
This custom classifier is designed to identify, which elements match our custom element description:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Step 1: IndividualSemanticElementExtractor\n", - "Step 2: ImageClassifier\n", - "Step 3: EmptyElementClassifier\n", - "Step 4: TableClassifier\n", - "Step 5: TableOfContentsClassifier\n", - "Step 6: TopLevelSectionManagerFor10Q\n", - "Step 7: IntroductorySectionElementClassifier\n", - "Step 8: MyClassifier\n", - "Step 9: HighlightedTextClassifier\n", - "Step 10: SupplementaryTextClassifier\n", - "Step 11: TitleClassifier\n", - "Step 12: TextElementMerger\n" - ] - } - ], - "source": [ - "from sec_parser.processing_steps import TextClassifier\n", - "\n", - "\n", - "# Create a custom element class\n", - "class MyElement(sp.TextElement):\n", - " pass\n", - "\n", - "\n", - "# Create a custom parsing step\n", - "class MyClassifier(TextClassifier):\n", - " def _process_element(self, element, context):\n", - " if element.text != \"\":\n", - " return MyElement.create_from_element(element, log_origin=\"MyClassifier\")\n", - "\n", - " # Let the parent class handle the other cases\n", - " return super()._process_element(element, context)\n", - "\n", - "\n", - "# Replace the default text parsing step with our custom one\n", - "steps = [MyClassifier() if isinstance(step, TextClassifier) else step for step in steps]\n", - "for i, step in enumerate(steps, 1):\n", - " print(f\"Step {i}: {step.__class__.__name__}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As demonstrated above, our custom classifier is now integrated into the pipeline. \n", - "\n", - "There's an additional caveat to consider. Without specifying an \"allowlist\" of types, TableElement will be classified as TextElement, as it contains text. 
To prevent this, we will process only `NotYetClassifiedElement` types and bypass processing for all other types.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;34mTitleElement\u001b[0m: Segment Operating Performance\n", - "├── \u001b[1;34mMyElement\u001b[0m: The following table shows net s...25, 2022 (dollars in millions):\n", - "├── \u001b[1;34mTableElement\u001b[0m: Table with ~7 rows, ~40 numbers, and 414 characters.\n", - "├── \u001b[1;34mTitleElement\u001b[0m: Americas\n", - "│ └── \u001b[1;34mMyElement\u001b[0m: Americas net sales decreased du...y higher net sales of Services.\n", - "├── \u001b[1;34mTitleElement\u001b[0m: Europe\n", - "│ └── \u001b[1;34mMyElement\u001b[0m: The weakness in foreign currenc... by higher net sales of iPhone.\n", - "├── \u001b[1;34mTitleElement\u001b[0m: Greater China\n", - "│ └── \u001b[1;34mMyElement\u001b[0m: The weakness in the renminbi re...y of lower net sales of iPhone.\n", - "├── \u001b[1;34mTitleElement\u001b[0m: Japan\n", - "│ └── \u001b[1;34mMyElement\u001b[0m: The weakness in the yen relativ...earables, Home and Accessories.\n", - "└── \u001b[1;34mTitleElement\u001b[0m: Rest of Asia Pacific\n", - " └── \u001b[1;34mMyElement\u001b[0m: The weakness in foreign currenc...fset by lower net sales of Mac.\n", - "...\n" - ] - } - ], - "source": [ - "def get_steps():\n", - " return [\n", - " MyClassifier(types_to_process={sp.NotYetClassifiedElement})\n", - " if isinstance(step, TextClassifier)\n", - " else step\n", - " for step in sp.Edgar10QParser().get_default_steps()\n", - " ]\n", - "\n", - "\n", - "elements = sp.Edgar10QParser(get_steps).parse(html)\n", - "tree = sp.TreeBuilder().build(elements)\n", - "section = [n for n in tree.nodes if n.text.startswith(\"Segment\")][0]\n", - "print(\"\\n\".join(sp.render(section).split(\"\\n\")[:13]), \"...\", sep=\"\\n\")" - ] - }, - { - "attachments": {}, - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "For more examples and advanced usage, you can continue learning how to use `sec-parser` by referring to the [**Developer Guide**](https://sec-parser.readthedocs.io/en/latest/notebooks/developer_guide.html) and [**Documentation**](https://sec-parser.rtfd.io). If you're interested in contributing, consider checking out our [**Contribution Guide**](https://github.com/alphanome-ai/sec-parser/blob/main/CONTRIBUTING.md)." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## What's Next?\n", - "\n", - "You've successfully parsed an SEC document into semantic elements and arranged them into a tree structure. To further analyze this data with analytics or AI, you can use any tool of your choice.\n", - "\n", - "For a tailored experience, consider using our free and open-source library for AI-powered financial analysis: \n", - "\n", - "[**Explore sec-ai on GitHub**](https://github.com/alphanome-ai/sec-ai)\n", - "\n", - "```bash\n", - "pip install sec-ai\n", - "```" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# This will auto-format your code. You can optionally install 'jupyter-black' using pip.\n", + "# Note: this cell is hidden from the HTML output. 
Read more: https://nbsphinx.readthedocs.io/en/0.2.1/hidden-cells.html\n", + "try:\n", + " import jupyter_black\n", + "\n", + " jupyter_black.load()\n", + "except ImportError:\n", + " pass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# User Guide: Quick Start" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Welcome to the User Guide for `sec-parser`! This guide is designed to walk you through the fundamental steps needed to install and use the library for parsing SEC EDGAR HTML documents into semantic elements and trees. Whether you're a financial analyst, a data scientist, or someone interested in SEC filings, this guide provides examples and code snippets to help you get started.\n", + "\n", + "This guide is interactive, allowing you to engage with the code and concepts as you learn. You can run and modify all the code examples shown here for yourself by cloning the repository and running the [user_guide.ipynb](https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb) in a Jupyter notebook.\n", + "\n", + "Alternatively, you can also run the notebook directly in your browser using Google Colab:\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb)\n", + "[![My Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/alphanome-ai/sec-parser/main?filepath=docs/source/notebooks/user_guide.ipynb)\n", + "[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/kernels/welcome?src=https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb)\n", + "[![Open in SageMaker Studio 
Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/https://github.com/alphanome-ai/sec-parser/blob/main/docs/source/notebooks/user_guide.ipynb)\n", + "\n", + "Let's get started!" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting Started" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This guide will walk you through the process of installing the `sec-parser` package and using it to extract the \"Segment Operating Performance\" section as a semantic tree from the latest Apple 10-Q filing." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Installation" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, install the `sec-parser` package using pip:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import sec_parser\n", + "except ImportError:\n", + " !pip install -q sec-parser\n", + " import sec_parser" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to run the example code in this Guide, you'll also need the `sec_downloader` package:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "try:\n", + " import sec_downloader\n", + "except ImportError:\n", + " !pip install -q sec-downloader\n", + " import sec_downloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Usage" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you've installed the necessary packages, you can start by downloading the filing from the SEC EDGAR website. 
Here's how you can do it:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from sec_downloader import Downloader\n", + "\n", + "# Initialize the downloader with your company name and email\n", + "dl = Downloader(\"MyCompanyName\", \"email@example.com\")\n", + "\n", + "# Download the latest 10-Q filing for Apple\n", + "html = dl.get_filing_html(ticker=\"AAPL\", form=\"10-Q\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> [!NOTE]\n", + "The company name and email address are used to form a user-agent string that adheres to the SEC EDGAR's fair access policy for programmatic downloading. [Source](https://www.sec.gov/os/webmaster-faq#code-support)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can parse the filing HTML into a list of semantic elements:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Utility function to make the example code a bit more compact\n", + "def print_first_n_lines(text: str, *, n: int):\n", + " print(\"\\n\".join(text.split(\"\\n\")[:n]), \"...\", sep=\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;34mTopSectionTitle\u001b[0m: PART I — FINANCIAL INFORMATION\n", + "\u001b[1;34mTopSectionTitle\u001b[0m: Item 1. 
Financial Statements\n", + "\u001b[1;34mTitleElement\u001b[0m: Apple Inc.\n", + "\u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n", + "\u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...housands and per share amounts)\n", + "\u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n", + "\u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n", + "...\n" + ] + } + ], + "source": [ + "import sec_parser as sp\n", + "\n", + "elements: list = sp.Edgar10QParser().parse(html)\n", + "\n", + "demo_output: str = sp.render(elements)\n", + "print_first_n_lines(demo_output, n=7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also construct a semantic tree to allow for easy filtering by parent sections:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;34mTopSectionTitle\u001b[0m: PART I — FINANCIAL INFORMATION\n", + "├── \u001b[1;34mTopSectionTitle\u001b[0m: Item 1. 
Financial Statements\n", + "│ ├── \u001b[1;34mTitleElement\u001b[0m: Apple Inc.\n", + "│ │ └── \u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n", + "│ │ ├── \u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...housands and per share amounts)\n", + "│ │ ├── \u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n", + "│ │ ├── \u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n", + "...\n" + ] + } + ], + "source": [ + "tree = sp.TreeBuilder().build(elements)\n", + "\n", + "demo_output: str = sp.render(tree)\n", + "print_first_n_lines(demo_output, n=7)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Advanced Usage" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Processing is organized in steps. You can modify, add, remove steps as needed. Each step is a function that takes a list of elements as input and returns a list of elements as output. The output of one step is the input of the next step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step 1: IndividualSemanticElementExtractor\n", + "Step 2: ImageClassifier\n", + "Step 3: EmptyElementClassifier\n", + "Step 4: TableClassifier\n", + "Step 5: TableOfContentsClassifier\n", + "Step 6: TopSectionManagerFor10Q\n", + "Step 7: IntroductorySectionElementClassifier\n", + "Step 8: TextClassifier\n", + "Step 9: HighlightedTextClassifier\n", + "Step 10: SupplementaryTextClassifier\n", + "Step 11: TitleClassifier\n", + "Step 12: TextElementMerger\n" + ] + } + ], + "source": [ + "steps = sp.Edgar10QParser().get_default_steps()\n", + "\n", + "for i, step in enumerate(steps, 1):\n", + " print(f\"Step {i}: {step.__class__.__name__}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's illustrate an example where we replace the text element classifier with our custom classifier. This custom classifier is designed to identify, which elements match our custom element description:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step 1: IndividualSemanticElementExtractor\n", + "Step 2: ImageClassifier\n", + "Step 3: EmptyElementClassifier\n", + "Step 4: TableClassifier\n", + "Step 5: TableOfContentsClassifier\n", + "Step 6: TopSectionManagerFor10Q\n", + "Step 7: IntroductorySectionElementClassifier\n", + "Step 8: MyClassifier\n", + "Step 9: HighlightedTextClassifier\n", + "Step 10: SupplementaryTextClassifier\n", + "Step 11: TitleClassifier\n", + "Step 12: TextElementMerger\n" + ] + } + ], + "source": [ + "from sec_parser.processing_steps import TextClassifier\n", + "\n", + "\n", + "# Create a custom element class\n", + "class MyElement(sp.TextElement):\n", + " pass\n", + "\n", + "\n", + "# Create a custom parsing step\n", + "class 
MyClassifier(TextClassifier):\n", + " def _process_element(self, element, context):\n", + " if element.text != \"\":\n", + " return MyElement.create_from_element(element, log_origin=\"MyClassifier\")\n", + "\n", + " # Let the parent class handle the other cases\n", + " return super()._process_element(element, context)\n", + "\n", + "\n", + "# Replace the default text parsing step with our custom one\n", + "steps = [MyClassifier() if isinstance(step, TextClassifier) else step for step in steps]\n", + "for i, step in enumerate(steps, 1):\n", + " print(f\"Step {i}: {step.__class__.__name__}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As demonstrated above, our custom classifier is now integrated into the pipeline. \n", + "\n", + "There's an additional caveat to consider. Without specifying an \"allowlist\" of types, TableElement will be classified as TextElement, as it contains text. To prevent this, we will process only `NotYetClassifiedElement` types and bypass processing for all other types.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;34mTitleElement\u001b[0m: Segment Operating Performance\n", + "├── \u001b[1;34mMyElement\u001b[0m: The following table shows net s...25, 2022 (dollars in millions):\n", + "├── \u001b[1;34mTableElement\u001b[0m: Table with ~7 rows, ~40 numbers, and 414 characters.\n", + "├── \u001b[1;34mTitleElement\u001b[0m: Americas\n", + "│ └── \u001b[1;34mMyElement\u001b[0m: Americas net sales decreased du...y higher net sales of Services.\n", + "├── \u001b[1;34mTitleElement\u001b[0m: Europe\n", + "│ └── \u001b[1;34mMyElement\u001b[0m: The weakness in foreign currenc... 
by higher net sales of iPhone.\n", + "├── \u001b[1;34mTitleElement\u001b[0m: Greater China\n", + "│ └── \u001b[1;34mMyElement\u001b[0m: The weakness in the renminbi re...y of lower net sales of iPhone.\n", + "├── \u001b[1;34mTitleElement\u001b[0m: Japan\n", + "│ └── \u001b[1;34mMyElement\u001b[0m: The weakness in the yen relativ...earables, Home and Accessories.\n", + "└── \u001b[1;34mTitleElement\u001b[0m: Rest of Asia Pacific\n", + " └── \u001b[1;34mMyElement\u001b[0m: The weakness in foreign currenc...fset by lower net sales of Mac.\n", + "...\n" + ] + } + ], + "source": [ + "def get_steps():\n", + " return [\n", + " MyClassifier(types_to_process={sp.NotYetClassifiedElement})\n", + " if isinstance(step, TextClassifier)\n", + " else step\n", + " for step in sp.Edgar10QParser().get_default_steps()\n", + " ]\n", + "\n", + "\n", + "elements = sp.Edgar10QParser(get_steps).parse(html)\n", + "tree = sp.TreeBuilder().build(elements)\n", + "section = [n for n in tree.nodes if n.text.startswith(\"Segment\")][0]\n", + "print(\"\\n\".join(sp.render(section).split(\"\\n\")[:13]), \"...\", sep=\"\\n\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more examples and advanced usage, you can continue learning how to use `sec-parser` by referring to the [**Developer Guide**](https://sec-parser.readthedocs.io/en/latest/notebooks/developer_guide.html) and [**Documentation**](https://sec-parser.rtfd.io). If you're interested in contributing, consider checking out our [**Contribution Guide**](https://github.com/alphanome-ai/sec-parser/blob/main/CONTRIBUTING.md)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What's Next?\n", + "\n", + "You've successfully parsed an SEC document into semantic elements and arranged them into a tree structure. 
To further analyze this data with analytics or AI, you can use any tool of your choice.\n", + "\n", + "For a tailored experience, consider using our free and open-source library for AI-powered financial analysis: \n", + "\n", + "[**Explore sec-ai on GitHub**](https://github.com/alphanome-ai/sec-ai)\n", + "\n", + "```bash\n", + "pip install sec-ai\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/sec_parser/__init__.py b/sec_parser/__init__.py index d7802e3..fb21ef0 100644 --- a/sec_parser/__init__.py +++ b/sec_parser/__init__.py @@ -25,7 +25,7 @@ ) from sec_parser.semantic_elements.table_element.table_element import TableElement from sec_parser.semantic_elements.title_element import TitleElement -from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle +from sec_parser.semantic_elements.top_section_title import TopSectionTitle from sec_parser.semantic_tree.nesting_rules import AbstractNestingRule from sec_parser.semantic_tree.render_ import render from sec_parser.semantic_tree.semantic_tree import SemanticTree @@ -40,7 +40,7 @@ "AbstractSemanticElement", "CompositeSemanticElement", "NotYetClassifiedElement", - "TopLevelSectionTitle", + "TopSectionTitle", "TextElement", "TitleElement", "IrrelevantElement", diff --git a/sec_parser/processing_engine/core.py b/sec_parser/processing_engine/core.py index ddbc111..06c4ba2 100644 --- a/sec_parser/processing_engine/core.py +++ b/sec_parser/processing_engine/core.py @@ -22,8 +22,8 @@ from 
sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.table_check import ( TableCheck, ) -from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.top_level_section_title_check import ( - TopLevelSectionTitleCheck, +from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.top_section_title_check import ( + TopSectionTitleCheck, ) from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.xbrl_tag_check import ( XbrlTagCheck, @@ -43,8 +43,8 @@ from sec_parser.processing_steps.text_classifier import TextClassifier from sec_parser.processing_steps.text_element_merger import TextElementMerger from sec_parser.processing_steps.title_classifier import TitleClassifier -from sec_parser.processing_steps.top_level_section_manager_for_10q import ( - TopLevelSectionManagerFor10Q, +from sec_parser.processing_steps.top_section_manager_for_10q import ( + TopSectionManagerFor10Q, ) from sec_parser.semantic_elements.composite_semantic_element import ( CompositeSemanticElement, @@ -184,7 +184,7 @@ def get_default_steps( EmptyElementClassifier(types_to_process={NotYetClassifiedElement}), TableClassifier(types_to_process={NotYetClassifiedElement}), TableOfContentsClassifier(types_to_process={TableElement}), - TopLevelSectionManagerFor10Q(types_to_process={NotYetClassifiedElement}), + TopSectionManagerFor10Q(types_to_process={NotYetClassifiedElement}), IntroductorySectionElementClassifier(), TextClassifier(types_to_process={NotYetClassifiedElement}), HighlightedTextClassifier(types_to_process={TextElement}), @@ -206,5 +206,5 @@ def get_default_single_element_checks(self) -> list[AbstractSingleElementCheck]: TableCheck(), XbrlTagCheck(), ImageCheck(), - TopLevelSectionTitleCheck(), + TopSectionTitleCheck(), ] diff --git a/sec_parser/processing_steps/__init__.py b/sec_parser/processing_steps/__init__.py index 75deab0..4e49ce3 100644 --- 
a/sec_parser/processing_steps/__init__.py +++ b/sec_parser/processing_steps/__init__.py @@ -22,8 +22,8 @@ from sec_parser.processing_steps.table_classifier import TableClassifier from sec_parser.processing_steps.text_classifier import TextClassifier from sec_parser.processing_steps.title_classifier import TitleClassifier -from sec_parser.processing_steps.top_level_section_manager_for_10q import ( - TopLevelSectionManagerFor10Q, +from sec_parser.processing_steps.top_section_manager_for_10q import ( + TopSectionManagerFor10Q, ) __all__ = [ @@ -36,5 +36,5 @@ "IndividualSemanticElementExtractor", "SupplementaryTextClassifier", "EmptyElementClassifier", - "TopLevelSectionManagerFor10Q", + "TopSectionManagerFor10Q", ] diff --git a/sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_level_section_title_check.py b/sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_section_title_check.py similarity index 69% rename from sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_level_section_title_check.py rename to sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_section_title_check.py index 6ddf788..4e48bcc 100644 --- a/sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_level_section_title_check.py +++ b/sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_section_title_check.py @@ -5,8 +5,8 @@ from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import ( AbstractSingleElementCheck, ) -from sec_parser.processing_steps.top_level_section_manager_for_10q import ( - TopLevelSectionManagerFor10Q, +from sec_parser.processing_steps.top_section_manager_for_10q import ( + TopSectionManagerFor10Q, ) if TYPE_CHECKING: # pragma: no cover @@ -16,13 +16,14 @@ def 
_is_match_part_or_item(text: str) -> bool: - return TopLevelSectionManagerFor10Q.is_match_part_or_item(text) + return TopSectionManagerFor10Q.is_match_part_or_item(text) -class TopLevelSectionTitleCheck(AbstractSingleElementCheck): +class TopSectionTitleCheck(AbstractSingleElementCheck): def contains_single_element(self, element: AbstractSemanticElement) -> bool | None: match_count = element.html_tag.count_text_matches_in_descendants( - _is_match_part_or_item, exclude_links=True, + _is_match_part_or_item, + exclude_links=True, ) if match_count >= 1: diff --git a/sec_parser/processing_steps/introductory_section_classifier.py b/sec_parser/processing_steps/introductory_section_classifier.py index 83c30cb..afca519 100644 --- a/sec_parser/processing_steps/introductory_section_classifier.py +++ b/sec_parser/processing_steps/introductory_section_classifier.py @@ -7,7 +7,7 @@ ElementProcessingContext, ) from sec_parser.semantic_elements.semantic_elements import IntroductorySectionElement -from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle +from sec_parser.semantic_elements.top_section_title import TopSectionTitle if TYPE_CHECKING: # pragma: no cover from sec_parser.semantic_elements.abstract_semantic_element import ( @@ -47,14 +47,14 @@ def _process_element( ) -> AbstractSemanticElement: if context.iteration == 0: if ( - isinstance(element, TopLevelSectionTitle) + isinstance(element, TopSectionTitle) and element.section_type.identifier == "part1" ): self._part1_exists = True return element if context.iteration == 1 and self._part1_exists: if ( - isinstance(element, TopLevelSectionTitle) + isinstance(element, TopSectionTitle) and element.section_type.identifier == "part1" ): self._part1_found = True diff --git a/sec_parser/processing_steps/top_level_section_manager_for_10q.py b/sec_parser/processing_steps/top_section_manager_for_10q.py similarity index 89% rename from sec_parser/processing_steps/top_level_section_manager_for_10q.py rename 
to sec_parser/processing_steps/top_section_manager_for_10q.py index a420b3f..4e5c283 100644 --- a/sec_parser/processing_steps/top_level_section_manager_for_10q.py +++ b/sec_parser/processing_steps/top_section_manager_for_10q.py @@ -10,11 +10,11 @@ AbstractElementwiseProcessingStep, ElementProcessingContext, ) -from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle -from sec_parser.semantic_elements.top_level_section_title_types import ( +from sec_parser.semantic_elements.top_section_title import TopSectionTitle +from sec_parser.semantic_elements.top_section_title_types import ( IDENTIFIER_TO_10Q_SECTION, - InvalidTopLevelSectionIn10Q, - TopLevelSectionType, + InvalidTopSectionIn10Q, + TopSectionType, ) if TYPE_CHECKING: # pragma: no cover @@ -29,11 +29,11 @@ @dataclass class _Candidate: - section_type: TopLevelSectionType + section_type: TopSectionType element: AbstractSemanticElement -class TopLevelSectionManagerFor10Q(AbstractElementwiseProcessingStep): +class TopSectionManagerFor10Q(AbstractElementwiseProcessingStep): """ Documents are divided into sections, subsections, and so on. Top level sections are the highest level of sections and are @@ -91,11 +91,11 @@ def _process_element( self._last_part = part section_type = IDENTIFIER_TO_10Q_SECTION.get( f"part{self._last_part}", - InvalidTopLevelSectionIn10Q, + InvalidTopSectionIn10Q, ) - if section_type is InvalidTopLevelSectionIn10Q: + if section_type is InvalidTopSectionIn10Q: warnings.warn( - f"Invalid section type for part{self._last_part}. Defaulting to InvalidTopLevelSectionIn10Q.", + f"Invalid section type for part{self._last_part}. 
Defaulting to InvalidTopSectionIn10Q.", UserWarning, stacklevel=8, ) @@ -103,11 +103,11 @@ def _process_element( elif item := self.match_item(element.text): section_type = IDENTIFIER_TO_10Q_SECTION.get( f"part{self._last_part}item{item}", - InvalidTopLevelSectionIn10Q, + InvalidTopSectionIn10Q, ) - if section_type is InvalidTopLevelSectionIn10Q: + if section_type is InvalidTopSectionIn10Q: warnings.warn( - f"Invalid section type for part{self._last_part}item{item}. Defaulting to InvalidTopLevelSectionIn10Q.", + f"Invalid section type for part{self._last_part}item{item}. Defaulting to InvalidTopSectionIn10Q.", UserWarning, stacklevel=8, ) @@ -123,7 +123,7 @@ def _process_element( if context.iteration == 1: if self._selected_candidates is None: grouped_candidates: dict[ - TopLevelSectionType, + TopSectionType, list[AbstractSemanticElement], ] = defaultdict(list) for candidate in self._candidates: @@ -170,7 +170,7 @@ def select_element( log_origin=self.__class__.__name__, ) continue - return TopLevelSectionTitle.create_from_element( + return TopSectionTitle.create_from_element( candidate.element, level=candidate.section_type.level, section_type=candidate.section_type, diff --git a/sec_parser/semantic_elements/__init__.py b/sec_parser/semantic_elements/__init__.py index 7d5856f..0dd651f 100644 --- a/sec_parser/semantic_elements/__init__.py +++ b/sec_parser/semantic_elements/__init__.py @@ -26,13 +26,13 @@ ) from sec_parser.semantic_elements.table_element.table_element import TableElement from sec_parser.semantic_elements.title_element import TitleElement -from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle +from sec_parser.semantic_elements.top_section_title import TopSectionTitle __all__ = [ "AbstractSemanticElement", "AbstractLevelElement", "NotYetClassifiedElement", - "TopLevelSectionTitle", + "TopSectionTitle", "TextElement", "TitleElement", "InvalidLevelError", diff --git a/sec_parser/semantic_elements/abstract_semantic_element.py 
b/sec_parser/semantic_elements/abstract_semantic_element.py index e3de549..0793eb4 100644 --- a/sec_parser/semantic_elements/abstract_semantic_element.py +++ b/sec_parser/semantic_elements/abstract_semantic_element.py @@ -35,10 +35,12 @@ def __init__( self.processing_log = processing_log or ProcessingLog() # If creating derived classes that override __init__, make sure to call this - # in the derived class's __init__ method. Pass log_origin=None to the base class. + # at the very end the derived class's __init__ method. Pass log_origin=None to the base class. + # This is to ensure that the log_init is called at the very end of the __init__ self.log_init(log_origin) def log_init(self, log_origin: LogItemOrigin | None = None) -> None: + """Has to be called at the very end of the __init__ method.""" if log_origin: self.processing_log.add_item( log_origin=log_origin, @@ -137,7 +139,7 @@ def __init__( msg = f"Level must be equal or greater than {self.MIN_LEVEL}" raise InvalidLevelError(msg) self.level = level - self.log_init(log_origin) + self.log_init(log_origin) # Has to be called at the very end @classmethod def create_from_element( diff --git a/sec_parser/semantic_elements/top_level_section_title.py b/sec_parser/semantic_elements/top_section_start_marker.py similarity index 71% rename from sec_parser/semantic_elements/top_level_section_title.py rename to sec_parser/semantic_elements/top_section_start_marker.py index 0f5f647..8c52a88 100644 --- a/sec_parser/semantic_elements/top_level_section_title.py +++ b/sec_parser/semantic_elements/top_section_start_marker.py @@ -6,24 +6,18 @@ AbstractLevelElement, AbstractSemanticElement, ) -from sec_parser.semantic_elements.mixins.dict_text_content_mixin import ( - DictTextContentMixin, -) if TYPE_CHECKING: # pragma: no cover from sec_parser.processing_engine.html_tag import HtmlTag from sec_parser.processing_engine.processing_log import LogItemOrigin, ProcessingLog - from 
sec_parser.semantic_elements.top_level_section_title_types import ( - TopLevelSectionType, - ) + from sec_parser.semantic_elements.top_section_title_types import TopSectionType -class TopLevelSectionTitle(DictTextContentMixin, AbstractLevelElement): +class TopSectionStartMarker(AbstractLevelElement): """ - The TopLevelSectionTitle class represents the title and the beginning of a top-level - section of a document. For instance, in SEC 10-Q reports, a - top-level section could be "Part I, Item 3. Quantitative and Qualitative - Disclosures About Market Risk.". + The TopSectionStartMarker class represents the beginning of a top-level + section of a document. It is used to mark the start of sections such as + "Part I, Item 1. Business" in SEC 10-Q reports. """ def __init__( @@ -33,13 +27,13 @@ def __init__( processing_log: ProcessingLog | None = None, log_origin: LogItemOrigin | None = None, level: int | None = None, - section_type: TopLevelSectionType | None = None, + section_type: TopSectionType | None = None, ) -> None: super().__init__( html_tag, processing_log=processing_log, level=level, - log_origin=None, + log_origin=None, # None because log_init has to be called at the very end ) if section_type is None: msg = "section_type cannot be None" @@ -54,7 +48,7 @@ def create_from_element( log_origin: LogItemOrigin, *, level: int | None = None, - section_type: TopLevelSectionType | None = None, + section_type: TopSectionType | None = None, ) -> AbstractLevelElement: return cls( source._html_tag, # noqa: SLF001 diff --git a/sec_parser/semantic_elements/top_section_title.py b/sec_parser/semantic_elements/top_section_title.py new file mode 100644 index 0000000..ded1d99 --- /dev/null +++ b/sec_parser/semantic_elements/top_section_title.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from sec_parser.semantic_elements.mixins.dict_text_content_mixin import ( + DictTextContentMixin, +) +from 
sec_parser.semantic_elements.top_section_start_marker import TopSectionStartMarker + +if TYPE_CHECKING: # pragma: no cover + from sec_parser.processing_engine.html_tag import HtmlTag + from sec_parser.processing_engine.processing_log import LogItemOrigin, ProcessingLog + from sec_parser.semantic_elements.abstract_semantic_element import ( + AbstractLevelElement, + AbstractSemanticElement, + ) + from sec_parser.semantic_elements.top_section_title_types import TopSectionType + + +class TopSectionTitle(DictTextContentMixin, TopSectionStartMarker): + """ + The TopSectionTitle class represents the title and the beginning of a top-level + section of a document. For instance, in SEC 10-Q reports, a + top-level section could be "Part I, Item 3. Quantitative and Qualitative + Disclosures About Market Risk.". + """ + + def __init__( + self, + html_tag: HtmlTag, + *, + processing_log: ProcessingLog | None = None, + log_origin: LogItemOrigin | None = None, + level: int | None = None, + section_type: TopSectionType | None = None, + ) -> None: + super().__init__( + html_tag, + processing_log=processing_log, + level=level, + log_origin=None, # None because log_init has to be called at the very end + section_type=section_type, + ) + self.log_init(log_origin) # log_init has to be called at the very end + + @classmethod + def create_from_element( + cls, + source: AbstractSemanticElement, + log_origin: LogItemOrigin, + *, + level: int | None = None, + section_type: TopSectionType | None = None, + ) -> AbstractLevelElement: + return cls( + source._html_tag, # noqa: SLF001 + level=level, + section_type=section_type, + processing_log=source.processing_log, + log_origin=log_origin, + ) + + def to_dict( + self, + *, + include_previews: bool = False, + include_contents: bool = False, + ) -> dict[str, Any]: + return { + **super().to_dict( + include_previews=include_previews, + include_contents=include_contents, + ), + "section_type": self.section_type.identifier, + } diff --git 
a/sec_parser/semantic_elements/top_level_section_title_types.py b/sec_parser/semantic_elements/top_section_title_types.py similarity index 80% rename from sec_parser/semantic_elements/top_level_section_title_types.py rename to sec_parser/semantic_elements/top_section_title_types.py index f392726..b4278a5 100644 --- a/sec_parser/semantic_elements/top_level_section_title_types.py +++ b/sec_parser/semantic_elements/top_section_title_types.py @@ -2,14 +2,14 @@ @dataclass(frozen=True) -class TopLevelSectionIn10Q: +class TopSectionIn10Q: identifier: str title: str order: int level: int = 0 -InvalidTopLevelSectionIn10Q = TopLevelSectionIn10Q( +InvalidTopSectionIn10Q = TopSectionIn10Q( identifier="invalid", title="Invalid", order=-1, @@ -17,79 +17,79 @@ class TopLevelSectionIn10Q: ) ALL_10Q_SECTIONS = ( - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part1", title="Financial Information", order=0, level=0, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part1item1", title="Financial Statements", order=1, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part1item2", title="Management's Discussion and Analysis of Financial Condition and Results of Operations", order=2, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part1item3", title="Quantitative and Qualitative Disclosures About Market Risk", order=3, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part1item4", title="Controls and Procedures", order=4, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2", title="Other Information", order=5, level=0, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2item1", title="Legal Proceedings", order=6, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2item1a", title="Risk Factors", order=7, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2item2", title="Unregistered Sales of Equity Securities and Use of Proceeds", order=8, level=1, 
), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2item3", title="Defaults Upon Senior Securities", order=9, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2item4", title="Mine Safety Disclosures", order=10, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2item5", title="Other Information", order=11, level=1, ), - TopLevelSectionIn10Q( + TopSectionIn10Q( identifier="part2item6", title="Exhibits", order=12, @@ -102,4 +102,4 @@ class TopLevelSectionIn10Q: } -TopLevelSectionType = TopLevelSectionIn10Q +TopSectionType = TopSectionIn10Q diff --git a/sec_parser/semantic_tree/tree_builder.py b/sec_parser/semantic_tree/tree_builder.py index 34d185d..5705b42 100644 --- a/sec_parser/semantic_tree/tree_builder.py +++ b/sec_parser/semantic_tree/tree_builder.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Callable from sec_parser.semantic_elements.title_element import TitleElement -from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle +from sec_parser.semantic_elements.top_section_start_marker import TopSectionStartMarker from sec_parser.semantic_tree.nesting_rules import ( AbstractNestingRule, AlwaysNestAsParentRule, @@ -54,14 +54,10 @@ def __init__( @staticmethod def get_default_rules() -> list[AbstractNestingRule]: return [ - AlwaysNestAsParentRule(TopLevelSectionTitle), - # Both RootSectionRule and TitleRule nest all elements under them, - # leading to a conflict where a decision between TopLevelSectionStartMarker - # and TitleRule is needed. This conflict is resolved by excluding - # TopLevelSectionStartMarker from the rule for TitleElements. 
+ AlwaysNestAsParentRule(TopSectionStartMarker), AlwaysNestAsParentRule( TitleElement, - exclude_children={TopLevelSectionTitle}, + exclude_children={TopSectionStartMarker}, ), NestSameTypeDependingOnLevelRule(), ] diff --git a/tests/exploratory/processing_steps/top_level_section_title_classifier/test_top_level_section_title_classifier.py b/tests/exploratory/processing_steps/top_level_section_title_classifier/test_top_level_section_title_classifier.py index 8b41413..e805d7c 100644 --- a/tests/exploratory/processing_steps/top_level_section_title_classifier/test_top_level_section_title_classifier.py +++ b/tests/exploratory/processing_steps/top_level_section_title_classifier/test_top_level_section_title_classifier.py @@ -4,7 +4,7 @@ import pytest -from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle +from sec_parser.semantic_elements.top_section_title import TopSectionTitle from tests.types import ParsedDocumentComponents, Report from tests.utils import all_reports, load_yaml_filter @@ -35,9 +35,7 @@ def test_top_level_section_title_classifier( parsed_report = parse(report) semantic_elements = parsed_report.semantic_elements sections = [ - element - for element in semantic_elements - if isinstance(element, TopLevelSectionTitle) + element for element in semantic_elements if isinstance(element, TopSectionTitle) ] # Sanity checks diff --git a/tests/snapshot/manage_snapshots.py b/tests/snapshot/manage_snapshots.py index 738eed2..549cba8 100644 --- a/tests/snapshot/manage_snapshots.py +++ b/tests/snapshot/manage_snapshots.py @@ -162,7 +162,10 @@ def manage_snapshots( html_content = f.read() execution_time_start = time.perf_counter() - elements = Edgar10QParser().parse(html_content) + elements = Edgar10QParser().parse( + html_content, + include_irrelevant_elements=True, + ) execution_time_in_seconds = time.perf_counter() - execution_time_start if action == "update": diff --git 
a/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_top_level_section_title_check.py b/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_top_level_section_title_check.py index e59ccc4..9b94eec 100644 --- a/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_top_level_section_title_check.py +++ b/tests/unit/processing_steps/individual_semantic_element_extractor/single_element_checks/test_top_level_section_title_check.py @@ -4,8 +4,8 @@ import pytest from sec_parser.processing_engine.html_tag import HtmlTag -from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.top_level_section_title_check import ( - TopLevelSectionTitleCheck, +from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.top_section_title_check import ( + TopSectionTitleCheck, ) from sec_parser.semantic_elements.abstract_semantic_element import ( AbstractSemanticElement, @@ -66,7 +66,7 @@ def test_top_level_section_title_check(name, html, root_tag, expected): bs4_tag = bs4.BeautifulSoup(html, "lxml").find(root_tag) assert isinstance(bs4_tag, bs4.Tag) html_tag = SemanticElement(HtmlTag(bs4_tag)) - check = TopLevelSectionTitleCheck() + check = TopSectionTitleCheck() # Act actual = check.contains_single_element(html_tag) diff --git a/tests/unit/semantic_elements/test_highlighted_text_element.py b/tests/unit/semantic_elements/test_highlighted_text_element.py index b2ebbc0..62bf7a3 100644 --- a/tests/unit/semantic_elements/test_highlighted_text_element.py +++ b/tests/unit/semantic_elements/test_highlighted_text_element.py @@ -38,7 +38,9 @@ def test_highlighted_text_element_from_element(): match="Style must be provided.", ): _ = HighlightedTextElement.create_from_element( - element, style=None, log_origin=None + element, + style=None, + log_origin=None, ) diff --git 
a/tests/unit/semantic_elements/test_top_level_section_title.py b/tests/unit/semantic_elements/test_top_level_section_title.py index 9aebed6..ec85f4d 100644 --- a/tests/unit/semantic_elements/test_top_level_section_title.py +++ b/tests/unit/semantic_elements/test_top_level_section_title.py @@ -2,8 +2,8 @@ from sec_parser.processing_engine.html_tag import HtmlTag from sec_parser.semantic_elements.highlighted_text_element import TextStyle -from sec_parser.semantic_elements.top_level_section_title import TopLevelSectionTitle -from sec_parser.semantic_elements.top_level_section_title_types import ( +from sec_parser.semantic_elements.top_section_title import TopSectionTitle +from sec_parser.semantic_elements.top_section_title_types import ( IDENTIFIER_TO_10Q_SECTION, ) @@ -15,7 +15,7 @@ def test_to_dict(): section_type = IDENTIFIER_TO_10Q_SECTION[identifier] # Act - actual = TopLevelSectionTitle(HtmlTag(tag), section_type=section_type).to_dict( + actual = TopSectionTitle(HtmlTag(tag), section_type=section_type).to_dict( include_previews=True, )