fix: preserve all elements when serialized; feat: helper functions for serialization (#273)

* added type to text element map * add element_id and coordinates * added test for serialization * added serialization for check boxes * add dict_to_elements and convert_to_dict aliases * helpers for serializing and deserializing elements * bump version; changelog * add Text to tests * aliases for isd functions * remove test elements json * changelog updates * make indent a kwarg * update expected structured output * docs update * use new function in ingest code * pop coordinates due to floating point differences * pop coordinates
Unstructured-IO · Feb 23, 2023 · 0d229f0 · 0d229f0
1 parent 354eff1
commit 0d229f0
Show file tree

Hide file tree

Showing 11 changed files with 635 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,15 @@
+## 0.4.15
+
+### Enhancements
+
+* Added `elements_to_json` and `elements_from_json` for easier serialization/deserialization
+* `convert_to_dict`, `dict_to_elements` and `convert_to_csv` are now aliases for functions
+  that use the ISD terminology.
+
+### Fixes
+
+* Update to ensure all elements are preserved during serialization/deserialization
+
 ## 0.4.14
 
 * Automatically install `nltk` models in the `tokenize` module.

diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -887,33 +887,33 @@ Staging
 Staging bricks in ``unstructured`` prepare extracted text for downstream tasks such
 as machine learning inference and data labeling.
 
-``convert_to_isd``
-------------------
+``convert_to_dict``
+--------------------
 
-Converts outputs to the initial structured data (ISD) format. This is the default format
-for returning data in Unstructured pipeline APIs.
+Converts a list of ``Element`` objects to a dictionary. This is the default format
+for representing documents in ``unstructured``.
 
 Examples:
 
 .. code:: python
 
   from unstructured.documents.elements import Title, NarrativeText
-  from unstructured.staging.base import convert_to_isd
+  from unstructured.staging.base import convert_to_dict
 
   elements = [Title(text="Title"), NarrativeText(text="Narrative")]
-  isd = convert_to_isd(elements)
+  isd = convert_to_dict(elements)
 
 
-``isd_to_elements``
--------------------
+``dict_to_elements``
+---------------------
 
-Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements.
+Converts a dictionary of the format produced by ``convert_to_dict`` back to a list of ``Element`` objects.
 
 Examples:
 
 .. code:: python
 
-  from unstructured.staging.base import isd_to_elements
+  from unstructured.staging.base import dict_to_elements
 
   isd = [
     {"text": "My Title", "type": "Title"},
@@ -922,10 +922,10 @@ Examples:
 
   # elements will look like:
   # [ Title(text="My Title"), NarrativeText(text="My Narrative")]
-  elements = isd_to_elements(isd)
+  elements = dict_to_elements(isd)
 
 
-``convert_to_isd_csv``
+``convert_to_csv``
 ----------------------
 
 Converts outputs to the initial structured data (ISD) format as a CSV string.
@@ -935,10 +935,10 @@ Examples:
 .. code:: python
 
   from unstructured.documents.elements import Title, NarrativeText
-  from unstructured.staging.base import convert_to_isd_csv
+  from unstructured.staging.base import convert_to_csv
 
   elements = [Title(text="Title"), NarrativeText(text="Narrative")]
-  isd_csv = convert_to_isd_csv(elements)
+  isd_csv = convert_to_csv(elements)
 
 
 ``convert_to_dataframe``

diff --git a/docs/source/elements.rst b/docs/source/elements.rst
@@ -44,4 +44,31 @@ Examples:
   item.apply(*cleaners)
 
   # The output will be: Учебник по крокодильным средам обитания
-  print(item)
+  print(item)
+
+####################
+Serializing Elements
+####################
+
+The ``unstructured`` library includes helper functions for
+reading and writing a list of ``Element`` objects to and
+from JSON. You can use the following workflow for
+serializing and deserializing an ``Element`` list.
+
+
+.. code:: python
+
+    from unstructured.documents.elements import ElementMetadata, Text, Title, FigureCaption
+    from unstructured.staging.base import elements_to_json, elements_from_json
+
+    filename = "my-elements.json"
+    metadata = ElementMetadata(filename="fake-file.txt")
+    elements = [
+        FigureCaption(text="caption", metadata=metadata, element_id="1"),
+        Title(text="title", metadata=metadata, element_id="2"),
+        Text(text="title", metadata=metadata, element_id="3"),
+
+    ]
+
+    elements_to_json(elements, filename=filename)
+    new_elements = elements_from_json(filename=filename)
diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py
@@ -8,7 +8,18 @@
 
 import unstructured.staging.base as base
 
-from unstructured.documents.elements import ElementMetadata, Title, NarrativeText, ListItem
+from unstructured.documents.elements import (
+    Address,
+    CheckBox,
+    ElementMetadata,
+    FigureCaption,
+    Title,
+    Text,
+    NarrativeText,
+    ListItem,
+    Image,
+    PageBreak,
+)
 
 
 @pytest.fixture
@@ -44,10 +55,10 @@ def test_isd_to_elements():
     ]
 
 
-def test_convert_to_isd_csv(output_csv_file):
+def test_convert_to_csv(output_csv_file):
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
     with open(output_csv_file, "w+") as csv_file:
-        isd_csv_string = base.convert_to_isd_csv(elements)
+        isd_csv_string = base.convert_to_csv(elements)
         csv_file.write(isd_csv_string)
 
     with open(output_csv_file, "r") as csv_file:
@@ -77,3 +88,41 @@ def test_convert_to_isd_serializes_with_posix_paths():
     output = base.convert_to_isd(elements)
     # NOTE(robinson) - json.dumps should run without raising an exception
     json.dumps(output)
+
+
+def test_all_elements_preserved_when_serialized():
+    metadata = ElementMetadata(filename="fake-file.txt")
+    elements = [
+        Address(text="address", metadata=metadata, element_id="1"),
+        CheckBox(checked=True, metadata=metadata, element_id="2"),
+        FigureCaption(text="caption", metadata=metadata, element_id="3"),
+        Title(text="title", metadata=metadata, element_id="4"),
+        NarrativeText(text="narrative", metadata=metadata, element_id="5"),
+        ListItem(text="list", metadata=metadata, element_id="6"),
+        Image(text="image", metadata=metadata, element_id="7"),
+        Text(text="text", metadata=metadata, element_id="8"),
+        PageBreak(),
+    ]
+
+    isd = base.convert_to_isd(elements)
+    assert base.convert_to_isd(base.isd_to_elements(isd)) == isd
+
+
+def test_serialized_deserialize_elements_to_json(tmpdir):
+    filename = os.path.join(tmpdir, "fake-elements.json")
+    metadata = ElementMetadata(filename="fake-file.txt")
+    elements = [
+        Address(text="address", metadata=metadata, element_id="1"),
+        CheckBox(checked=True, metadata=metadata, element_id="2"),
+        FigureCaption(text="caption", metadata=metadata, element_id="3"),
+        Title(text="title", metadata=metadata, element_id="4"),
+        NarrativeText(text="narrative", metadata=metadata, element_id="5"),
+        ListItem(text="list", metadata=metadata, element_id="6"),
+        Image(text="image", metadata=metadata, element_id="7"),
+        Text(text="text", metadata=metadata, element_id="8"),
+        PageBreak(),
+    ]
+
+    base.elements_to_json(elements, filename=filename)
+    new_elements = base.elements_from_json(filename=filename)
+    assert elements == new_elements