diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 92c0e8c6..b1594a35 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -765,7 +765,7 @@ def entities_to_bigquery( ) def split_pdf(self, pdf_path: str, output_path: str) -> List[str]: - r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor. + r"""Splits local PDF file into multiple PDF files based on output from a Splitter processor. Args: pdf_path (str): @@ -776,6 +776,8 @@ def split_pdf(self, pdf_path: str, output_path: str) -> List[str]: List[str]: A list of output pdf files. """ + if self.entities[0].start_page is None or self.entities[0].end_page is None: + raise ValueError("Entities do not contain start or end pages.") output_files: List[str] = [] input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path)) with Pdf.open(pdf_path) as pdf: diff --git a/google/cloud/documentai_toolbox/wrappers/entity.py b/google/cloud/documentai_toolbox/wrappers/entity.py index 2ccb6530..02b2e1ba 100644 --- a/google/cloud/documentai_toolbox/wrappers/entity.py +++ b/google/cloud/documentai_toolbox/wrappers/entity.py @@ -40,45 +40,46 @@ class Entity: Required. Entity type from a schema e.g. "Address". mention_text (str): Optional. Text value in the document e.g. "1600 Amphitheatre Pkwy". - If the entity is not present in - the document, this field will be empty. + Only populated for Extraction processors. normalized_text (str): Optional. Normalized text value in the document e.g. "1970-01-01". - If the entity is not present in - the document, this field will be empty. + Only populated for Extraction processors. start_page (int): - Required. `Page` containing the `Entity` or the first page of the - classification (for Splitter/Classifier processors). + Optional. `Page` containing the `Entity` for Extraction processors or the first page of the + subdocument for Splitter processors. end_page (int): - Required. Last page of the classification + Optional. Last page of the subdocument for Splitter processors. """ documentai_object: documentai.Document.Entity = dataclasses.field(repr=False) page_offset: dataclasses.InitVar[Optional[int]] = 0 type_: str = dataclasses.field(init=False) - mention_text: str = dataclasses.field(init=False, default="") - normalized_text: str = dataclasses.field(init=False, default="") + mention_text: Optional[str] = dataclasses.field(init=False, default=None) + normalized_text: Optional[str] = dataclasses.field(init=False, default=None) - start_page: int = dataclasses.field(init=False) - # Only Populated for Splitter/Classifier Output - end_page: int = dataclasses.field(init=False) + start_page: Optional[int] = dataclasses.field(init=False, default=None) + end_page: Optional[int] = dataclasses.field(init=False, default=None) _image: Optional[Image.Image] = dataclasses.field(init=False, default=None) def __post_init__(self, page_offset: int) -> None: self.type_ = self.documentai_object.type_ - self.mention_text = self.documentai_object.mention_text + + if self.documentai_object.mention_text: + self.mention_text = self.documentai_object.mention_text + if ( self.documentai_object.normalized_value and self.documentai_object.normalized_value.text ): self.normalized_text = self.documentai_object.normalized_value.text - page_refs = self.documentai_object.page_anchor.page_refs - if page_refs: - self.start_page = int(page_refs[0].page) + page_offset - self.end_page = int(page_refs[-1].page) + page_offset + if self.documentai_object.page_anchor: + page_refs = self.documentai_object.page_anchor.page_refs + if page_refs: + self.start_page = int(page_refs[0].page) + page_offset + self.end_page = int(page_refs[-1].page) + page_offset def crop_image( self, documentai_page: documentai.Document.Page diff --git a/tests/unit/resources/classifier/custom_classifier_output.json b/tests/unit/resources/classifier/custom_classifier_output.json new file mode 100644 index 00000000..1cd42eb0 --- /dev/null +++ b/tests/unit/resources/classifier/custom_classifier_output.json @@ -0,0 +1 @@ +{"text": "US010182182B2\n(12) United States Patent\nLewkow et al.\n(10) Patent No.: US 10,182,182 B2\n(45) Date of Patent: Jan. 15, 2019\n(54)\nIMAGE SENSOR HAVING MULTIPLE\nOUTPUT PORTS\n(71)\nApplicant: Google LLC, Mountain View, CA (US)\nH04N 7/0127 (2013.01); H04N_7/0806\n(2013.01); H04N 13/239 (2018.05); H04N\n13/254 (2018.05); H04N 13/271 (2018.05)\n(58) Field of Classification Search\nCPC G01S 17/08; H04N 5/2258; H04N 5/23229;\nH04N 13/0271; H04N 5/376; H04N\n5/3765; H04N 5/378; H04N 5/345\nSee application file for complete search history.\n(72)\nInventors: Roman Lewkow, San Jose, CA (US);\nChung Chun Wan, San Jose, CA (US)\n(73)\nAssignee: Google LLC, Mountain View, CA (US)\n(*)\nNotice:\n(56)\nReferences Cited\nSubject to any disclaimer, the term of this\npatent is extended or adjusted under 35\nU.S.C. 154(b) by 0 days.\nU.S. PATENT DOCUMENTS\n(21)\nAppl. No.: 15/831,925\n6,831,688 B2 * 12/2004 Lareau\nGO1J 3/02\n348/272\n(22)\nFiled:\nDec. 5, 2017\n7,247,393 B2\n7,936,038 B2\n7,990,636 B2\n8,027,107 B2\n7/2007 Hazel et al.\n5/2011 Jeong et al.\n8/2011 Park et al.\n9/2011 Hwang et al.\n(Continued)\n(65)\nPrior Publication Data\nFOREIGN PATENT DOCUMENTS\n(63)\nUS 2018/0097979 A1 Apr. 5, 2018\nRelated U.S. Application Data\nContinuation of application No. 15/476,165, filed on\nMar. 31, 2017, now Pat. No. 9,866,740, which is a\ncontinuation of application No. 14/580,025, filed on\nDec. 22, 2014, now Pat. No. 9,615,013.\nEP\n1478176\n11/2004\nOTHER PUBLICATIONS\n(51)\nPCT/US2015/062157-International Search Report & Written Opin-\nion, dated Mar. 8, 2016, 12 pages.\n(Continued)\nPrimary Examiner Nicholas G Giles\n(74) Attorney, Agent, or Firm - Fish & Richardson P.C.\nInt. Cl.\nH04N 5/225\n(2006.01)\nH04N 5/374\n(2011.01)\nH04N 5/378\n(2011.01)\nH04N 5/369\n(2011.01)\nH04N 5/232\n(2006.01)\nH04N 13/254 (2018.01)\nH04N 13/271\n(2018.01)\nH04N 7/01\n(2006.01)\nH04N 7/08\n(2006.01)\nH04N 13/239 (2018.01)\nU.S. Cl.\nCPC H04N 5/2258 (2013.01); H04N 5/23245\n(2013.01); H04N 5/3696 (2013.01); H04N\n5/378 (2013.01); H04N 5/3742 (2013.01);\n(57)\nABSTRACT\nAn apparatus is described that includes an image sensor\nhaving a first output port and a second output port. The first\noutput port is to transmit a first image stream concurrently\nwith a second image stream transmitted from the second\noutput port.\n(52)\n18 Claims, 10 Drawing Sheets\nImage Sensor\n410b\nFirst Image\nStream 401b\nImage Signal\nProcessing\nPipeline 407_1b\n1\n1\n2\n413_1b\n1\n2\n3\n5\n6\nSecond Image\nStream 402b\n413_2b\nImage Signal Processing\nPipeline 407_2b\ntime\n", "pages": [{"pageNumber": 1}], "entities": [{"type": "computer_vision", "confidence": 0.47925246, "id": "0"}, {"type": "crypto", "confidence": 0.0433604, "id": "1"}, {"type": "med_tech", "confidence": 0.26732057, "id": "2"}, {"type": "other", "confidence": 0.2100666, "id": "3"}]} diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 6949d9df..22bb2258 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -79,6 +79,13 @@ def get_bytes_splitter_mock(): yield byte_factory +@pytest.fixture +def get_bytes_classifier_mock(): + with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: + byte_factory.return_value = get_bytes("tests/unit/resources/classifier") + yield byte_factory + + @pytest.fixture def get_bytes_images_mock(): with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory: @@ -206,6 +213,30 @@ def test_entities_from_shards_with_hex_ids(): assert actual[1].type_ == "class_international" +def test_entities_from_shards_classifier(get_bytes_classifier_mock): + shards = document._get_shards( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/" + ) + get_bytes_classifier_mock.assert_called_once() + + actual = document._entities_from_shards(shards=shards) + + # Check for error reported in https://github.com/googleapis/python-documentai-toolbox/issues/332 + assert repr(actual) + assert actual[0].type_ == "computer_vision" + assert round(actual[0].documentai_object.confidence, 8) == 0.47925246 + assert actual[0].documentai_object.id == "0" + assert actual[1].type_ == "crypto" + assert round(actual[1].documentai_object.confidence, 8) == 0.0433604 + assert actual[1].documentai_object.id == "1" + assert actual[2].type_ == "med_tech" + assert round(actual[2].documentai_object.confidence, 8) == 0.26732057 + assert actual[2].documentai_object.id == "2" + assert actual[3].type_ == "other" + assert round(actual[3].documentai_object.confidence, 8) == 0.2100666 + assert actual[3].documentai_object.id == "3" + + @mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai") def test_get_batch_process_metadata_with_valid_operation( mock_docai, @@ -703,6 +734,22 @@ def test_split_pdf(mock_Pdf, get_bytes_splitter_mock): ] +def test_split_pdf_with_non_splitter(get_bytes_classifier_mock): + doc = document.Document.from_gcs( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" + ) + + with pytest.raises( + ValueError, + match="Entities do not contain start or end pages.", + ): + doc.split_pdf( + pdf_path="procurement_multi_document.pdf", output_path="splitter/output/" + ) + + get_bytes_classifier_mock.assert_called_once() + + def test_convert_document_to_annotate_file_response(): doc = document.Document.from_document_path( document_path="tests/unit/resources/0/toolbox_invoice_test-0.json" diff --git a/tests/unit/test_entity.py b/tests/unit/test_entity.py index c382d4e8..148d66c0 100644 --- a/tests/unit/test_entity.py +++ b/tests/unit/test_entity.py @@ -68,6 +68,21 @@ def test_Entity_splitter(): assert wrapper_entity.end_page == 2 +def test_Entity_classifier(): + documentai_entity = documentai.Document.Entity( + type_="clinical_notes", + id="0", + confidence=0.99878639, + ) + wrapper_entity = entity.Entity(documentai_entity) + assert wrapper_entity.type_ == "clinical_notes" + assert wrapper_entity.documentai_object.id == "0" + assert round(wrapper_entity.documentai_object.confidence, 8) == 0.99878639 + assert not wrapper_entity.mention_text + assert not wrapper_entity.start_page + assert not wrapper_entity.end_page + + def test_Entity_with_page_offset(): documentai_entity = documentai.Document.Entity( type_="invoice_statement",