From 5df7bb6e813a4c755eff7a1ef9f5a7ba0665afd2 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 4 Mar 2024 09:16:41 -0600 Subject: [PATCH 1/6] fix: Changed `client_info` import `google.api_core.client_info` -> `google.api_core.gapic_v1.client_info` Fixes #266 --- google/cloud/documentai_toolbox/utilities/gcs_utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 0c5ff3e2..077ca19a 100644 --- a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py +++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -18,7 +18,7 @@ import re from typing import Dict, List, Optional, Tuple -from google.api_core import client_info +from google.api_core.gapic_v1 import client_info from google.cloud import documentai, documentai_toolbox, storage from google.cloud.documentai_toolbox import constants From 2192c042be366bd4222932ecb0783aadfe25c70f Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 4 Mar 2024 10:05:37 -0600 Subject: [PATCH 2/6] re-add client options/api_location endpoint for batch_process_operation --- .../documentai_toolbox/wrappers/document.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 6a49ed49..8026ba36 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -22,6 +22,7 @@ import re from typing import Dict, List, Optional, Type, Union +from google.api_core.client_options import ClientOptions from google.api_core.operation import from_gapic as operation_from_gapic from google.cloud.vision import AnnotateFileResponse from google.longrunning.operations_pb2 import GetOperationRequest @@ -138,6 +139,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume def 
_get_batch_process_metadata( operation_name: str, + location: Optional[str] = None, timeout: Optional[float] = None, ) -> documentai.BatchProcessMetadata: r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation. @@ -146,6 +148,10 @@ def _get_batch_process_metadata( operation_name (str): Required. The fully qualified operation name for a `batch_process_documents()` operation. + location (str): + Optional. The location of the processor used for `batch_process_documents()`. + Deprecated. Maintained for backwards compatibility. + timeout (float): Optional. Default None. Time in seconds to wait for operation to complete. If None, will wait indefinitely. @@ -153,8 +159,21 @@ def _get_batch_process_metadata( documentai.BatchProcessMetadata: Metadata from batch process. """ + # Validate Operation Name + match = re.search( + r"\/projects\/\w+\/locations\/(\w+)\/operations\/\w+\/", operation_name + ) + + if not match: + raise ValueError(f"Invalid Operation Name: {operation_name}") + + location = location or match.group(1) + client = documentai.DocumentProcessorServiceClient( client_info=gcs_utilities._get_client_info(module="get_batch_process_metadata"), + client_options=ClientOptions( + api_endpoint=f"{location}-documentai.googleapis.com" + ), ) # Poll Operation until complete. 
@@ -599,6 +618,7 @@ def from_batch_process_operation( return cls.from_batch_process_metadata( metadata=_get_batch_process_metadata( operation_name=operation_name, + location=location, timeout=timeout, ) ) From f05e25e743acfdd18ee99600b08d443b86eb51d9 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 4 Mar 2024 10:40:27 -0600 Subject: [PATCH 3/6] Fix Regex Test and metadata input --- google/cloud/documentai_toolbox/wrappers/document.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 8026ba36..1e836e2b 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -161,7 +161,7 @@ def _get_batch_process_metadata( """ # Validate Operation Name match = re.search( - r"\/projects\/\w+\/locations\/(\w+)\/operations\/\w+\/", operation_name + r"projects\/\w+\/locations\/(\w+)\/operations\/\w+", operation_name ) if not match: @@ -180,7 +180,6 @@ def _get_batch_process_metadata( operation = operation_from_gapic( operation=client.get_operation( request=GetOperationRequest(name=operation_name), - metadata=documentai.BatchProcessMetadata(), ), operations_client=client, result_type=documentai.BatchProcessResponse, From 0cfd7011ed0b0a297bd9dbbf10a20755d8e9c333 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 4 Mar 2024 10:41:05 -0600 Subject: [PATCH 4/6] Add examples for all document initialization options (w/tests) --- samples/snippets/quickstart_sample.py | 85 ++++++++++++++----- samples/snippets/test_quickstart_sample.py | 95 +++++++++++++++++++++- 2 files changed, 157 insertions(+), 23 deletions(-) diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py index f0d3a998..0cea14db 100644 --- a/samples/snippets/quickstart_sample.py +++ b/samples/snippets/quickstart_sample.py @@ -15,41 +15,88 @@ # [START documentai_toolbox_quickstart] +from 
typing import Optional +from google.cloud import documentai from google.cloud.documentai_toolbox import document from google.cloud.documentai_toolbox import gcs_utilities # TODO(developer): Uncomment these variables before running the sample. -# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder +# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder # gcs_bucket_name = "bucket" # gcs_prefix = "path/to/folder" +# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json +# gcs_uri = "gs://bucket/path/to/folder/document.json" + +# Or, given a Document JSON in path local/path/to/folder/document.json +# document_path = "local/path/to/folder/document.json" + +# Or, given a Document object from Document AI +# documentai_document = documentai.Document() + +# Or, given a BatchProcessMetadata object from Document AI +# operation = client.batch_process_documents(request) +# operation.result(timeout=timeout) +# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata) + +# Or, given a BatchProcessOperation name from Document AI +# batch_process_operation = "projects/project_id/locations/location/operations/operation_id" + + +def quickstart_sample( + gcs_bucket_name: Optional[str] = None, + gcs_prefix: Optional[str] = None, + gcs_uri: Optional[str] = None, + document_path: Optional[str] = None, + documentai_document: Optional[documentai.Document] = None, + batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None, + batch_process_operation: Optional[str] = None, +) -> None: + if gcs_bucket_name and gcs_prefix: + # Load from Google Cloud Storage Directory + print("Document structure in Cloud Storage") + gcs_utilities.print_gcs_document_tree( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix + ) + + wrapped_document = document.Document.from_gcs( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix + ) + elif gcs_uri: + # Load a single Document from a 
Google Cloud Storage URI + wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri) + elif document_path: + # Load from local `Document` JSON file + wrapped_document = document.Document.from_document_path(document_path) + elif documentai_document: + # Load from `documentai.Document` object + wrapped_document = document.Document.from_documentai_document( + documentai_document + ) + elif batch_process_metadata: + # Load Documents from `BatchProcessMetadata` object + wrapped_documents = document.Document.from_batch_process_metadata( + metadata=batch_process_metadata + ) + wrapped_document = wrapped_documents[0] + elif batch_process_operation: + wrapped_documents = document.Document.from_batch_process_operation( + location="us", operation_name=batch_process_operation + ) + wrapped_document = wrapped_documents[0] + else: + raise ValueError("No document source provided.") -def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None: - print("Document structure in Cloud Storage") - gcs_utilities.print_gcs_document_tree( - gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix - ) - - wrapped_document = document.Document.from_gcs( - gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix - ) # For all properties and methods, refer to: # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document - # Alternatively, create wrapped document from: - # - # - Local `Document` JSON file: `document.Document.from_document_path()` - # - `Document` object: `document.Document.from_documentai_document()` - # - `BatchProcessMetadata`: `document.Document.from_batch_process_metadata()` - # - Batch Processing Operation: `document.Document.from_batch_process_operation()` - print("Document Successfully Loaded!") print(f"\t Number of Pages: {len(wrapped_document.pages)}") print(f"\t Number of Entities: {len(wrapped_document.entities)}") - for idx, page in enumerate(wrapped_document.pages): - print(f"Page 
{idx}") + for page in wrapped_document.pages: + print(f"Page {page.page_number}") for block in page.blocks: print(block.text) for paragraph in page.paragraphs: diff --git a/samples/snippets/test_quickstart_sample.py b/samples/snippets/test_quickstart_sample.py index 912a27d8..cb7a9c4a 100644 --- a/samples/snippets/test_quickstart_sample.py +++ b/samples/snippets/test_quickstart_sample.py @@ -18,18 +18,105 @@ import pytest from samples.snippets import quickstart_sample +from google.cloud import documentai +from google.longrunning.operations_pb2 import ListOperationsRequest # type: ignore + location = "us" project_id = os.environ["GOOGLE_CLOUD_PROJECT"] -gcs_bucket_name = "documentai_toolbox_samples" -gcs_input_uri = "output/123456789/0" -def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None: +def test_quickstart_sample_gcs_bucket_prefix(capsys: pytest.CaptureFixture) -> None: + gcs_bucket_name = "documentai_toolbox_samples" + gcs_prefix = "output/123456789/0" quickstart_sample.quickstart_sample( - gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix ) out, _ = capsys.readouterr() assert "Document structure in Cloud Storage" in out assert "Number of Pages: 1" in out assert "Number of Entities: 35" in out + + +def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None: + gcs_uri = ( + "gs://documentai_toolbox_samples/output/123456789/0/toolbox_invoice_test-0.json" + ) + quickstart_sample.quickstart_sample(gcs_uri=gcs_uri) + out, _ = capsys.readouterr() + + assert "Number of Pages: 1" in out + assert "Number of Entities: 35" in out + + +def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None: + document_path = "resources/form_with_tables.json" + quickstart_sample.quickstart_sample(document_path=document_path) + out, _ = capsys.readouterr() + + assert "Number of Pages: 1" in out + assert "Number of Entities: 0" in out + assert "Form Date" in out + + +def 
test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) -> None: + with open("resources/form_with_tables.json", encoding="utf-8") as f: + documentai_document = documentai.Document.from_json( + f.read(), ignore_unknown_fields=True + ) + + quickstart_sample.quickstart_sample(documentai_document=documentai_document) + out, _ = capsys.readouterr() + + assert "Number of Pages: 1" in out + assert "Number of Entities: 0" in out + assert "Form Date" in out + + +def test_quickstart_sample_batch_process_metadata( + capsys: pytest.CaptureFixture, +) -> None: + client = documentai.DocumentProcessorServiceClient() + name = f"{client.common_location_path(project=project_id, location=location)}/operations" + response = client.list_operations( + request=ListOperationsRequest( + name=name, + filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE", + page_size=1, + ) + ) + batch_process_metadata = documentai.BatchProcessMetadata.deserialize( + response.operations[0].metadata.value + ) + + quickstart_sample.quickstart_sample(batch_process_metadata=batch_process_metadata) + + out, _ = capsys.readouterr() + + assert "Document Successfully Loaded!" in out + + +def test_quickstart_sample_batch_process_operation( + capsys: pytest.CaptureFixture, +) -> None: + client = documentai.DocumentProcessorServiceClient() + name = f"{client.common_location_path(project=project_id, location=location)}/operations" + response = client.list_operations( + request=ListOperationsRequest( + name=name, + filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE", + page_size=1, + ) + ) + batch_process_operation = response.operations[0].name + + quickstart_sample.quickstart_sample(batch_process_operation=batch_process_operation) + + out, _ = capsys.readouterr() + + assert "Document Successfully Loaded!" 
in out + + +def test_quickstart_sample_no_input() -> None: + with pytest.raises(ValueError, match="No document source provided."): + quickstart_sample.quickstart_sample() From c6b1afdcc15c24465cb86f314b9134cd3a5e316d Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 4 Mar 2024 11:37:26 -0600 Subject: [PATCH 5/6] Add the expected format in `_get_batch_process_metadata` --- google/cloud/documentai_toolbox/wrappers/document.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 1e836e2b..7818a2fa 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -165,7 +165,10 @@ def _get_batch_process_metadata( ) if not match: - raise ValueError(f"Invalid Operation Name: {operation_name}") + raise ValueError( + f"Invalid Operation Name: {operation_name}\n" + "Expected operation name in the format `projects/<project_id>/locations/<location>/operations/<operation_id>`" + ) location = location or match.group(1) From cea3679a7337dc7edb795964f3cba307025fe2b6 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 4 Mar 2024 11:39:51 -0600 Subject: [PATCH 6/6] Add unit test for get_batch_process_metadata with invalid operation name --- tests/unit/test_document.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 19c36ca4..e5ef5f1f 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -317,6 +317,16 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai): document._get_batch_process_metadata(operation_name) +def test_get_batch_process_metadata_with_invalid_operation_name(): + with pytest.raises( + ValueError, + match="Invalid Operation Name", + ): + document._get_batch_process_metadata( + "projects//locations/us/operations/7890123" + ) + + def test_bigquery_column_name(): string_map = { "Phone 
#:": "phone_num",