Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion google/cloud/documentai_toolbox/wrappers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ def entities_to_bigquery(
)

def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
r"""Splits local PDF file into multiple PDF files based on output from a Splitter processor.

Args:
pdf_path (str):
Expand All @@ -776,6 +776,8 @@ def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
List[str]:
A list of output pdf files.
"""
if self.entities[0].start_page is None or self.entities[0].end_page is None:
raise ValueError("Entities do not contain start or end pages.")
output_files: List[str] = []
input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
with Pdf.open(pdf_path) as pdf:
Expand Down
35 changes: 18 additions & 17 deletions google/cloud/documentai_toolbox/wrappers/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,45 +40,46 @@ class Entity:
Required. Entity type from a schema e.g. "Address".
mention_text (str):
Optional. Text value in the document e.g. "1600 Amphitheatre Pkwy".
If the entity is not present in
the document, this field will be empty.
Only populated for Extraction processors.
normalized_text (str):
Optional. Normalized text value in the document e.g. "1970-01-01".
If the entity is not present in
the document, this field will be empty.
Only populated for Extraction processors.
start_page (int):
Required. `Page` containing the `Entity` or the first page of the
classification (for Splitter/Classifier processors).
Optional. `Page` containing the `Entity` for Extraction processors or the first page of the
subdocument for Splitter processors.
end_page (int):
Required. Last page of the classification
Optional. Last page of the subdocument for Splitter processors.
"""

documentai_object: documentai.Document.Entity = dataclasses.field(repr=False)
page_offset: dataclasses.InitVar[Optional[int]] = 0

type_: str = dataclasses.field(init=False)
mention_text: str = dataclasses.field(init=False, default="")
normalized_text: str = dataclasses.field(init=False, default="")
mention_text: Optional[str] = dataclasses.field(init=False, default=None)
normalized_text: Optional[str] = dataclasses.field(init=False, default=None)

start_page: int = dataclasses.field(init=False)
# Only Populated for Splitter/Classifier Output
end_page: int = dataclasses.field(init=False)
start_page: Optional[int] = dataclasses.field(init=False, default=None)
end_page: Optional[int] = dataclasses.field(init=False, default=None)

_image: Optional[Image.Image] = dataclasses.field(init=False, default=None)

def __post_init__(self, page_offset: Optional[int]) -> None:
    """Populate the wrapper fields from the wrapped Document AI entity.

    Each optional field is assigned only when the entity actually provides
    a value; otherwise the field keeps its declared default.

    Args:
        page_offset (Optional[int]):
            Value added to the entity's page references to produce
            ``start_page`` and ``end_page``. ``None`` is treated as ``0``.
    """
    wrapped = self.documentai_object
    # The InitVar is declared Optional, so guard the addition below
    # against a None offset.
    offset = page_offset or 0

    self.type_ = wrapped.type_

    # Assign only when text is present so an empty value does not replace
    # the field default.
    if wrapped.mention_text:
        self.mention_text = wrapped.mention_text

    if wrapped.normalized_value and wrapped.normalized_value.text:
        self.normalized_text = wrapped.normalized_value.text

    # Entities without a page anchor leave start_page / end_page untouched.
    if wrapped.page_anchor:
        page_refs = wrapped.page_anchor.page_refs
        if page_refs:
            self.start_page = int(page_refs[0].page) + offset
            self.end_page = int(page_refs[-1].page) + offset

def crop_image(
self, documentai_page: documentai.Document.Page
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"text": "US010182182B2\n(12) United States Patent\nLewkow et al.\n(10) Patent No.: US 10,182,182 B2\n(45) Date of Patent: Jan. 15, 2019\n(54)\nIMAGE SENSOR HAVING MULTIPLE\nOUTPUT PORTS\n(71)\nApplicant: Google LLC, Mountain View, CA (US)\nH04N 7/0127 (2013.01); H04N_7/0806\n(2013.01); H04N 13/239 (2018.05); H04N\n13/254 (2018.05); H04N 13/271 (2018.05)\n(58) Field of Classification Search\nCPC G01S 17/08; H04N 5/2258; H04N 5/23229;\nH04N 13/0271; H04N 5/376; H04N\n5/3765; H04N 5/378; H04N 5/345\nSee application file for complete search history.\n(72)\nInventors: Roman Lewkow, San Jose, CA (US);\nChung Chun Wan, San Jose, CA (US)\n(73)\nAssignee: Google LLC, Mountain View, CA (US)\n(*)\nNotice:\n(56)\nReferences Cited\nSubject to any disclaimer, the term of this\npatent is extended or adjusted under 35\nU.S.C. 154(b) by 0 days.\nU.S. PATENT DOCUMENTS\n(21)\nAppl. No.: 15/831,925\n6,831,688 B2 * 12/2004 Lareau\nGO1J 3/02\n348/272\n(22)\nFiled:\nDec. 5, 2017\n7,247,393 B2\n7,936,038 B2\n7,990,636 B2\n8,027,107 B2\n7/2007 Hazel et al.\n5/2011 Jeong et al.\n8/2011 Park et al.\n9/2011 Hwang et al.\n(Continued)\n(65)\nPrior Publication Data\nFOREIGN PATENT DOCUMENTS\n(63)\nUS 2018/0097979 A1 Apr. 5, 2018\nRelated U.S. Application Data\nContinuation of application No. 15/476,165, filed on\nMar. 31, 2017, now Pat. No. 9,866,740, which is a\ncontinuation of application No. 14/580,025, filed on\nDec. 22, 2014, now Pat. No. 9,615,013.\nEP\n1478176\n11/2004\nOTHER PUBLICATIONS\n(51)\nPCT/US2015/062157-International Search Report & Written Opin-\nion, dated Mar. 8, 2016, 12 pages.\n(Continued)\nPrimary Examiner Nicholas G Giles\n(74) Attorney, Agent, or Firm - Fish & Richardson P.C.\nInt. Cl.\nH04N 5/225\n(2006.01)\nH04N 5/374\n(2011.01)\nH04N 5/378\n(2011.01)\nH04N 5/369\n(2011.01)\nH04N 5/232\n(2006.01)\nH04N 13/254 (2018.01)\nH04N 13/271\n(2018.01)\nH04N 7/01\n(2006.01)\nH04N 7/08\n(2006.01)\nH04N 13/239 (2018.01)\nU.S. 
Cl.\nCPC H04N 5/2258 (2013.01); H04N 5/23245\n(2013.01); H04N 5/3696 (2013.01); H04N\n5/378 (2013.01); H04N 5/3742 (2013.01);\n(57)\nABSTRACT\nAn apparatus is described that includes an image sensor\nhaving a first output port and a second output port. The first\noutput port is to transmit a first image stream concurrently\nwith a second image stream transmitted from the second\noutput port.\n(52)\n18 Claims, 10 Drawing Sheets\nImage Sensor\n410b\nFirst Image\nStream 401b\nImage Signal\nProcessing\nPipeline 407_1b\n1\n1\n2\n413_1b\n1\n2\n3\n5\n6\nSecond Image\nStream 402b\n413_2b\nImage Signal Processing\nPipeline 407_2b\ntime\n", "pages": [{"pageNumber": 1}], "entities": [{"type": "computer_vision", "confidence": 0.47925246, "id": "0"}, {"type": "crypto", "confidence": 0.0433604, "id": "1"}, {"type": "med_tech", "confidence": 0.26732057, "id": "2"}, {"type": "other", "confidence": 0.2100666, "id": "3"}]}
47 changes: 47 additions & 0 deletions tests/unit/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,13 @@ def get_bytes_splitter_mock():
yield byte_factory


@pytest.fixture
def get_bytes_classifier_mock():
    """Patch ``gcs_utilities.get_bytes`` to return the classifier test resources.

    Yields:
        The mock that replaces ``gcs_utilities.get_bytes`` while the
        fixture is active.
    """
    with mock.patch.object(gcs_utilities, "get_bytes") as patched_get_bytes:
        patched_get_bytes.return_value = get_bytes("tests/unit/resources/classifier")
        yield patched_get_bytes


@pytest.fixture
def get_bytes_images_mock():
with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory:
Expand Down Expand Up @@ -206,6 +213,30 @@ def test_entities_from_shards_with_hex_ids():
assert actual[1].type_ == "class_international"


def test_entities_from_shards_classifier(get_bytes_classifier_mock):
    """Entities built from the classifier shards expose type, confidence and id."""
    shards = document._get_shards(
        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/"
    )
    get_bytes_classifier_mock.assert_called_once()

    actual = document._entities_from_shards(shards=shards)

    # Check for error reported in https://github.com/googleapis/python-documentai-toolbox/issues/332
    assert repr(actual)

    # (type_, confidence, id) for each entity, in the order they are returned.
    expected_entities = [
        ("computer_vision", 0.47925246, "0"),
        ("crypto", 0.0433604, "1"),
        ("med_tech", 0.26732057, "2"),
        ("other", 0.2100666, "3"),
    ]
    for position, (entity_type, confidence, entity_id) in enumerate(
        expected_entities
    ):
        wrapped = actual[position]
        assert wrapped.type_ == entity_type
        assert round(wrapped.documentai_object.confidence, 8) == confidence
        assert wrapped.documentai_object.id == entity_id


@mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai")
def test_get_batch_process_metadata_with_valid_operation(
mock_docai,
Expand Down Expand Up @@ -703,6 +734,22 @@ def test_split_pdf(mock_Pdf, get_bytes_splitter_mock):
]


def test_split_pdf_with_non_splitter(get_bytes_classifier_mock):
    """``split_pdf`` raises ``ValueError`` for a document loaded from classifier output."""
    classifier_doc = document.Document.from_gcs(
        gcs_bucket_name="test-directory",
        gcs_prefix="documentai/output/123456789/0",
    )
    expected_message = "Entities do not contain start or end pages."

    with pytest.raises(ValueError, match=expected_message):
        classifier_doc.split_pdf(
            pdf_path="procurement_multi_document.pdf",
            output_path="splitter/output/",
        )

    # Checked after the split attempt, as in the original ordering.
    get_bytes_classifier_mock.assert_called_once()


def test_convert_document_to_annotate_file_response():
doc = document.Document.from_document_path(
document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"
Expand Down
15 changes: 15 additions & 0 deletions tests/unit/test_entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@ def test_Entity_splitter():
assert wrapper_entity.end_page == 2


def test_Entity_classifier():
    """An entity with only type, id and confidence leaves the optional fields unset."""
    documentai_entity = documentai.Document.Entity(
        type_="clinical_notes",
        id="0",
        confidence=0.99878639,
    )

    wrapper_entity = entity.Entity(documentai_entity)

    assert wrapper_entity.type_ == "clinical_notes"
    assert wrapper_entity.documentai_object.id == "0"
    assert round(wrapper_entity.documentai_object.confidence, 8) == 0.99878639

    # Compare against None explicitly: a bare `not value` would also accept
    # an empty string or a page number of 0, so it could not tell "unset"
    # apart from a populated-but-falsy value.
    assert wrapper_entity.mention_text is None
    assert wrapper_entity.normalized_text is None
    assert wrapper_entity.start_page is None
    assert wrapper_entity.end_page is None

Comment thread
holtskinner marked this conversation as resolved.

def test_Entity_with_page_offset():
documentai_entity = documentai.Document.Entity(
type_="invoice_statement",
Expand Down