aws-samples · Belval · Jun 20, 2024 · Jun 20, 2024
diff --git a/tests/fixtures/receipt_no_summary.png b/tests/fixtures/receipt_no_summary.png
diff --git a/tests/fixtures/saved_api_responses/test_analyze_expense_no_summary_fields.json b/tests/fixtures/saved_api_responses/test_analyze_expense_no_summary_fields.json
diff --git a/tests/test_analyze_expense.py b/tests/test_analyze_expense.py
@@ -41,7 +41,7 @@ def test_analyze_expense_from_path(self):
         self.assertEqual(len(document.expense_documents[0].summary_groups.VENDOR), 2)
         self.assertEqual(len(document.expense_documents[0].line_items_groups[0].to_pandas()), 4,
                          "There are 4 line item in the receipts")
-    
+
     def test_analyze_expense_from_image(self):
         # Testing local single image input
         if os.environ.get("CALL_TEXTRACT"):
@@ -58,6 +58,39 @@ def test_analyze_expense_from_image(self):
         self.assertEqual(len(document.expense_documents[0].line_items_groups[0].to_pandas()), 4,
                          "There are 4 line item in the receipts")
 
+
+class TestTextractorAnalyzeExpenseNoSummary(unittest.TestCase):
+    """Expense parsing for a receipt fixture where no summary fields were recognized."""
+
+    def setUp(self):
+        # Insert credentials and file paths here to run the test against a live account
+        self.profile_name = "default"
+        self.current_directory = os.path.abspath(os.path.dirname(__file__))
+        self.image_path = os.path.join(self.current_directory, "fixtures/receipt_no_summary.png")
+
+        if self.profile_name is None:
+            raise InvalidProfileNameError(
+                "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_analyze_expense.py."
+            )
+        if os.environ.get("CALL_TEXTRACT"):
+            self.extractor = Textractor(profile_name=self.profile_name, kms_key_id="")
+
+    def test_analyze_expense_no_summary_fields(self):
+        """Correctly load expense line items where no summary fields were recognized
+
+        Per: https://github.com/aws-samples/amazon-textract-textractor/issues/370
+        """
+        if os.environ.get("CALL_TEXTRACT"):
+            document = self.extractor.analyze_expense(file_source=self.image_path)
+            with open(get_fixture_path(), "w") as f:
+                json.dump(document.response, f)
+        else:
+            document = Document.open(get_fixture_path())
+
+        self.assertIsInstance(document, Document)
+        self.assertEqual(len(document.expense_documents), 1)
+        self.assertGreater(len(document.expense_documents[0].line_items_groups), 0)
+
 if __name__ == "__main__":
     test = TestTextractorAnalyzeExpense()
     test.setUp()

diff --git a/textractor/parsers/response_parser.py b/textractor/parsers/response_parser.py
@@ -1477,10 +1477,21 @@ def parser_analyze_expense_response(response):
     ]
     document = parse_document_api_response(response)
     for doc in response["ExpenseDocuments"]:
-        # FIXME
-        if len(doc["SummaryFields"]) == 0:
+        page_number = None
+        if len(doc["SummaryFields"]):
+            page_number = doc["SummaryFields"][0].get("PageNumber")
+        elif len(doc["LineItemGroups"]):
+            # A group must have at least one LineItem, and a LI must have at least one field:
+            first_field = doc["LineItemGroups"][0]["LineItems"][0]["LineItemExpenseFields"][0]
+            page_number = first_field.get("PageNumber")
+        if page_number is None:
+            logging.warning(
+                "Skipping parsing ExpenseDocument %s as its page number could not be determined"
+                % (doc["ExpenseIndex"],)
+            )
             continue
-        page = document.pages[doc["SummaryFields"][0]["PageNumber"] - 1]
+
+        page = document.pages[page_number - 1]
         summary_fields = []
         for summary_field in doc["SummaryFields"]:
             summary_fields.append(create_expense_from_field(summary_field, page))
@@ -1525,7 +1536,7 @@ def parser_analyze_expense_response(response):
             page=page.page_num,
         )
         expense_document.raw_object = doc
-        document.pages[summary_field["PageNumber"] - 1].expense_documents.append(
+        document.pages[page_number - 1].expense_documents.append(
             expense_document
         )
     document.response = response