Skip to content

Commit

Permalink
[feat] add base64ai document redaction
Browse files Browse the repository at this point in the history
  • Loading branch information
Daggx committed Nov 13, 2023
1 parent 7aa8a79 commit bc2c296
Show file tree
Hide file tree
Showing 5 changed files with 14,981 additions and 22 deletions.
120 changes: 101 additions & 19 deletions edenai_apis/apis/base64/base64_api.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from itertools import zip_longest
import json
import uuid
from typing import Any, Dict, Sequence, Type, TypeVar, Union
from collections import defaultdict
import mimetypes
import base64
from enum import Enum
from proto import message
import requests
from edenai_apis.features.ocr.anonymization_async.anonymization_async_dataclass import AnonymizationAsyncDataClass
from edenai_apis.features.ocr.bank_check_parsing import (
BankCheckParsingDataClass,
MicrModel,
Expand All @@ -30,24 +31,14 @@
format_date,
)
from edenai_apis.features.ocr.invoice_parser import (
CustomerInformationInvoice,
InfosInvoiceParserDataClass,
InvoiceParserDataClass,
ItemLinesInvoice,
LocaleInvoice,
MerchantInformationInvoice,
TaxesInvoice,
BankInvoice,
CustomerInformationInvoice, InfosInvoiceParserDataClass, InvoiceParserDataClass,
ItemLinesInvoice, LocaleInvoice, MerchantInformationInvoice,
TaxesInvoice, BankInvoice,
)
from edenai_apis.features.ocr.receipt_parser import (
CustomerInformation,
InfosReceiptParserDataClass,
ItemLines,
Locale,
MerchantInformation,
ReceiptParserDataClass,
Taxes,
PaymentInformation,
CustomerInformation, InfosReceiptParserDataClass, ItemLines,
Locale, MerchantInformation, ReceiptParserDataClass,
Taxes, PaymentInformation,
)
from edenai_apis.features.image.face_compare import (
FaceCompareDataClass,
Expand All @@ -63,9 +54,14 @@
convert_string_to_number,
retreive_first_number_from_string,
)
from apis.amazon.helpers import check_webhook_result
from edenai_apis.utils.exception import ProviderException
from edenai_apis.utils.types import ResponseType

from edenai_apis.utils.types import (
AsyncBaseResponseType, AsyncLaunchJobResponseType,
ResponseType, AsyncPendingResponseType, AsyncResponseType
)
from edenai_apis.utils.upload_s3 import upload_file_bytes_to_s3, USER_PROCESS
from io import BytesIO

class SubfeatureParser(Enum):
RECEIPT = "receipt"
Expand All @@ -84,6 +80,8 @@ def __init__(self, api_keys: Dict = {}) -> None:
)
self.api_key = self.api_settings["secret"]
self.url = "https://base64.ai/api/scan"
self.webhook_settings = load_provider(ProviderDataEnum.KEY, "webhooksite")
self.webhook_token = self.webhook_settings.get("webhook_token")

class Field:
def __init__(self, document: dict) -> None:
Expand Down Expand Up @@ -666,3 +664,87 @@ def ocr__bank_check_parsing(
original_response=original_response,
standardized_response=BankCheckParsingDataClass(extracted_data=items),
)

def ocr__anonymization_async__launch_job(
    self, file: str, file_url: str = ""
) -> AsyncLaunchJobResponseType:
    """Send a document to base64.ai for redaction and store the result.

    The file is read, embedded as a base64 data URI and posted to
    ``self.url`` with redaction settings. The parsed provider response is
    then posted to webhook.site under a freshly generated job id, where
    ``ocr__anonymization_async__get_job_result`` looks it up later.

    Args:
        file: Path of the document to redact.
        file_url: Accepted for interface compatibility; not used here.

    Returns:
        An ``AsyncLaunchJobResponseType`` carrying the generated job id.

    Raises:
        Whatever ``self._get_response`` raises on a failed provider call
        (presumably ``ProviderException`` -- defined outside this view).
    """
    # Use a context manager so the handle is released even if reading fails.
    with open(file, "rb") as file_:
        encoded_content = base64.b64encode(file_.read()).decode()
    image_as_base64 = (
        f"data:{mimetypes.guess_type(file)[0]};base64," + encoded_content
    )

    payload = json.dumps(
        {
            "image": image_as_base64,
            "settings": {
                "redactions": {
                    # Every entry needs its own trailing comma: adjacent
                    # string literals are silently concatenated, which
                    # previously merged "vin" and "total" into "vintotal".
                    "fields": [
                        "name",
                        "givenName",
                        "familyName",
                        "organization",
                        "documentNumber",
                        "address",
                        "date",
                        "dateOfBirth",
                        "issueDate",
                        "expirationDate",
                        "vin",
                        "total",
                        "tax",
                    ],
                    "faces": True,
                    "signatures": True,
                }
            },
        }
    )

    headers = {"Content-Type": "application/json", "Authorization": self.api_key}

    response = requests.post(url=self.url, headers=headers, data=payload)

    original_response = self._get_response(response)

    # The provider call above already returned the result; it is parked on
    # webhook.site keyed by a generated id so it can be fetched later.
    job_id = "document_anonymization_base64" + str(uuid.uuid4())
    data_job_id = {job_id: original_response}
    # NOTE(review): the webhook.site response is not checked, so a failed
    # store only surfaces when the result is requested.
    requests.post(
        url=f"https://webhook.site/{self.webhook_token}",
        data=json.dumps(data_job_id),
        headers={"content-type": "application/json"},
    )

    return AsyncLaunchJobResponseType(provider_job_id=job_id)

def ocr__anonymization_async__get_job_result(
    self, provider_job_id: str
) -> AsyncBaseResponseType[AnonymizationAsyncDataClass]:
    """Fetch the stored redaction result for a previously launched job.

    The webhook entries for ``provider_job_id`` are retrieved through
    ``check_webhook_result``. The redacted document is extracted from the
    stored provider response, decoded and uploaded via
    ``upload_file_bytes_to_s3``.

    Args:
        provider_job_id: Job id returned by the launch method.

    Returns:
        ``AsyncPendingResponseType`` when the stored payload has no value
        for the job id, otherwise ``AsyncResponseType`` with the base64
        document and the URL of the uploaded file.

    Raises:
        ProviderException: if the webhook lookup does not return 200, the
            stored content is empty or not valid JSON, or the provider
            response carries no usable ``redactedDocument`` data URI.
    """
    webhook_result, response_status = check_webhook_result(
        provider_job_id, self.webhook_settings
    )

    if response_status != 200:
        raise ProviderException(webhook_result, code=response_status)

    # Pick the first webhook entry whose raw content mentions the job id.
    result_object = None
    if webhook_result:
        result_object = next(
            (
                response
                for response in webhook_result
                if provider_job_id in response["content"]
            ),
            None,
        )

    if not result_object or not result_object.get("content"):
        raise ProviderException("Provider returned an empty response")

    try:
        original_response = json.loads(result_object["content"]).get(
            provider_job_id, None
        )
    except json.JSONDecodeError as exc:
        # Chain the cause so the decoding failure stays visible in tracebacks.
        raise ProviderException(
            "An error occurred while parsing the response."
        ) from exc

    if original_response is None:
        return AsyncPendingResponseType[AnonymizationAsyncDataClass](
            provider_job_id=provider_job_id
        )

    # Extract the redacted document (a base64 data URI). A response with an
    # unexpected shape is reported as a provider error, not a raw crash.
    try:
        redacted_document = original_response[0].get("redactedDocument")
    except (AttributeError, IndexError, KeyError, TypeError) as exc:
        raise ProviderException(
            "Provider response does not contain a redacted document"
        ) from exc

    if not isinstance(redacted_document, str) or ";base64," not in redacted_document:
        raise ProviderException(
            "Provider response does not contain a redacted document"
        )

    # Keep only the base64 payload that follows the data-URI header.
    base64_data = redacted_document.split(";base64,")[1]

    content_bytes = base64.b64decode(base64_data)
    # NOTE(review): the upload suffix is hard-coded to ".png" whatever the
    # type of the submitted document -- confirm this is intended.
    resource_url = upload_file_bytes_to_s3(
        BytesIO(content_bytes), ".png", USER_PROCESS
    )
    return AsyncResponseType[AnonymizationAsyncDataClass](
        original_response=original_response,
        standardized_response=AnonymizationAsyncDataClass(
            document=base64_data, document_url=resource_url
        ),
        provider_job_id=provider_job_id,
    )
14 changes: 14 additions & 0 deletions edenai_apis/apis/base64/info.json
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,20 @@
"application/zip"
]
}
},
"anonymization_async" : {
"constraints": {
"file_types": [
"image/jpeg",
"image/png",
"image/gif",
"image/x-ms-bmp",
"image/tiff",
"image/webp",
"image/heic"
]
},
"version" : "v1"
}
},
"image": {
Expand Down
Loading

0 comments on commit bc2c296

Please sign in to comment.