googleapis · holtskinner · Mar 4, 2024 · Mar 4, 2024 · Mar 4, 2024 · Mar 4, 2024
diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py
@@ -18,7 +18,7 @@
 import re
 from typing import Dict, List, Optional, Tuple
 
-from google.api_core import client_info
+from google.api_core.gapic_v1 import client_info
 
 from google.cloud import documentai, documentai_toolbox, storage
 from google.cloud.documentai_toolbox import constants

diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -22,6 +22,7 @@
 import re
 from typing import Dict, List, Optional, Type, Union
 
+from google.api_core.client_options import ClientOptions
 from google.api_core.operation import from_gapic as operation_from_gapic
 from google.cloud.vision import AnnotateFileResponse
 from google.longrunning.operations_pb2 import GetOperationRequest
@@ -138,6 +139,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume
 
 def _get_batch_process_metadata(
     operation_name: str,
+    location: Optional[str] = None,
     timeout: Optional[float] = None,
 ) -> documentai.BatchProcessMetadata:
     r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation.
@@ -146,22 +148,41 @@ def _get_batch_process_metadata(
         operation_name (str):
             Required. The fully qualified operation name for a `batch_process_documents()` operation.
 
+        location (str):
+            Optional. The location of the processor used for `batch_process_documents()`.
+            Deprecated; kept for backwards compatibility. Defaults to the location in `operation_name`.
+
         timeout (float):
             Optional. Default None. Time in seconds to wait for operation to complete.
             If None, will wait indefinitely.
     Returns:
         documentai.BatchProcessMetadata:
             Metadata from batch process.
     """
+    # Validate Operation Name. IDs may contain hyphens (e.g. location `europe-west2`).
+    match = re.search(
+        r"projects\/[\w-]+\/locations\/([\w-]+)\/operations\/[\w-]+", operation_name
+    )
+
+    if not match:
+        raise ValueError(
+            f"Invalid Operation Name: {operation_name}\n"
+            "Expected operation name in the format `projects/<project>/locations/<location>/operations/<operation>`"
+        )
+
+    location = location or match.group(1)  # an explicit argument takes precedence
+
     client = documentai.DocumentProcessorServiceClient(
         client_info=gcs_utilities._get_client_info(module="get_batch_process_metadata"),
+        client_options=ClientOptions(
+            api_endpoint=f"{location}-documentai.googleapis.com"
+        ),
     )
 
     # Poll Operation until complete.
     operation = operation_from_gapic(
         operation=client.get_operation(
             request=GetOperationRequest(name=operation_name),
-            metadata=documentai.BatchProcessMetadata(),
         ),
         operations_client=client,
         result_type=documentai.BatchProcessResponse,
@@ -599,6 +620,7 @@ def from_batch_process_operation(
         return cls.from_batch_process_metadata(
             metadata=_get_batch_process_metadata(
                 operation_name=operation_name,
+                location=location,
                 timeout=timeout,
             )
         )

@@ -15,41 +15,88 @@
 
 
 # [START documentai_toolbox_quickstart]
+from typing import Optional
 
+from google.cloud import documentai
 from google.cloud.documentai_toolbox import document
 from google.cloud.documentai_toolbox import gcs_utilities
 
 # TODO(developer): Uncomment these variables before running the sample.
-# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
+# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder
 # gcs_bucket_name = "bucket"
 # gcs_prefix = "path/to/folder"
 
+# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json
+# gcs_uri = "gs://bucket/path/to/folder/document.json"
+
+# Or, given a Document JSON in path local/path/to/folder/document.json
+# document_path = "local/path/to/folder/document.json"
+
+# Or, given a Document object from Document AI
+# documentai_document = documentai.Document()
+
+# Or, given a BatchProcessMetadata object from Document AI
+# operation = client.batch_process_documents(request)
+# operation.result(timeout=timeout)
+# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata)
+
+# Or, given a BatchProcessOperation name from Document AI
+# batch_process_operation = "projects/project_id/locations/location/operations/operation_id"
+
+
+def quickstart_sample(
+    gcs_bucket_name: Optional[str] = None,
+    gcs_prefix: Optional[str] = None,
+    gcs_uri: Optional[str] = None,
+    document_path: Optional[str] = None,
+    documentai_document: Optional[documentai.Document] = None,
+    batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
+    batch_process_operation: Optional[str] = None,
+) -> None:
+    if gcs_bucket_name and gcs_prefix:
+        # Load from Google Cloud Storage Directory
+        print("Document structure in Cloud Storage")
+        gcs_utilities.print_gcs_document_tree(
+            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
+        )
+
+        wrapped_document = document.Document.from_gcs(
+            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
+        )
+    elif gcs_uri:
+        # Load a single Document from a Google Cloud Storage URI
+        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
+    elif document_path:
+        # Load from local `Document` JSON file
+        wrapped_document = document.Document.from_document_path(document_path)
+    elif documentai_document is not None:
+        # Load from `documentai.Document` object (an empty proto message is falsy)
+        wrapped_document = document.Document.from_documentai_document(
+            documentai_document
+        )
+    elif batch_process_metadata:
+        # Load Documents from `BatchProcessMetadata` object
+        wrapped_documents = document.Document.from_batch_process_metadata(
+            metadata=batch_process_metadata
+        )
+        wrapped_document = wrapped_documents[0]
+    elif batch_process_operation:
+        wrapped_documents = document.Document.from_batch_process_operation(
+            location="us", operation_name=batch_process_operation
+        )
+        wrapped_document = wrapped_documents[0]
+    else:
+        raise ValueError("No document source provided.")
 
-def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:
-    print("Document structure in Cloud Storage")
-    gcs_utilities.print_gcs_document_tree(
-        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
-    )
-
-    wrapped_document = document.Document.from_gcs(
-        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
-    )
     # For all properties and methods, refer to:
     # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document
 
-    # Alternatively, create wrapped document from:
-    #
-    # - Local `Document` JSON file:     `document.Document.from_document_path()`
-    # - `Document` object:              `document.Document.from_documentai_document()`
-    # - `BatchProcessMetadata`:         `document.Document.from_batch_process_metadata()`
-    # - Batch Processing Operation:     `document.Document.from_batch_process_operation()`
-
     print("Document Successfully Loaded!")
     print(f"\t Number of Pages: {len(wrapped_document.pages)}")
     print(f"\t Number of Entities: {len(wrapped_document.entities)}")
 
-    for idx, page in enumerate(wrapped_document.pages):
-        print(f"Page {idx}")
+    for page in wrapped_document.pages:
+        print(f"Page {page.page_number}")
         for block in page.blocks:
             print(block.text)
         for paragraph in page.paragraphs:

@@ -18,18 +18,105 @@
 import pytest
 from samples.snippets import quickstart_sample
 
+from google.cloud import documentai
+from google.longrunning.operations_pb2 import ListOperationsRequest  # type: ignore
+
 location = "us"
 project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
-gcs_bucket_name = "documentai_toolbox_samples"
-gcs_input_uri = "output/123456789/0"
 
 
-def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None:
+def test_quickstart_sample_gcs_bucket_prefix(capsys: pytest.CaptureFixture) -> None:
+    gcs_bucket_name = "documentai_toolbox_samples"
+    gcs_prefix = "output/123456789/0"
     quickstart_sample.quickstart_sample(
-        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri
+        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
     )
     out, _ = capsys.readouterr()
 
     assert "Document structure in Cloud Storage" in out
     assert "Number of Pages: 1" in out
     assert "Number of Entities: 35" in out
+
+
+def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None:
+    """Load one Document JSON by its `gs://` URI and check the printed counts."""
+    quickstart_sample.quickstart_sample(
+        gcs_uri="gs://documentai_toolbox_samples/output/123456789/0/toolbox_invoice_test-0.json"
+    )
+    printed = capsys.readouterr().out
+
+    assert "Number of Pages: 1" in printed
+    assert "Number of Entities: 35" in printed
+
+
+def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None:
+    """Load a local Document JSON file and check the printed summary and text."""
+    quickstart_sample.quickstart_sample(document_path="resources/form_with_tables.json")
+    printed = capsys.readouterr().out
+
+    expected_fragments = ("Number of Pages: 1", "Number of Entities: 0", "Form Date")
+    for fragment in expected_fragments:
+        assert fragment in printed
+
+
+def test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) -> None:
+    """Parse a local JSON file into a `documentai.Document` and load it directly."""
+    with open("resources/form_with_tables.json", encoding="utf-8") as json_file:
+        raw_json = json_file.read()
+    parsed_doc = documentai.Document.from_json(raw_json, ignore_unknown_fields=True)
+
+    quickstart_sample.quickstart_sample(documentai_document=parsed_doc)
+    printed = capsys.readouterr().out
+
+    assert "Number of Pages: 1" in printed
+    assert "Number of Entities: 0" in printed
+    assert "Form Date" in printed
+
+
+def test_quickstart_sample_batch_process_metadata(
+    capsys: pytest.CaptureFixture,
+) -> None:
+    """Load documents from the metadata of a completed batch process operation."""
+    docai_client = documentai.DocumentProcessorServiceClient()
+    location_path = docai_client.common_location_path(
+        project=project_id, location=location
+    )
+    # Ask for a single finished batch-processing operation in this location.
+    list_request = ListOperationsRequest(
+        name=f"{location_path}/operations",
+        filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE",
+        page_size=1,
+    )
+    first_operation = docai_client.list_operations(request=list_request).operations[0]
+    metadata = documentai.BatchProcessMetadata.deserialize(
+        first_operation.metadata.value
+    )
+
+    quickstart_sample.quickstart_sample(batch_process_metadata=metadata)
+    assert "Document Successfully Loaded!" in capsys.readouterr().out
+
+
+def test_quickstart_sample_batch_process_operation(
+    capsys: pytest.CaptureFixture,
+) -> None:
+    """Load documents using only the name of a completed batch process operation."""
+    docai_client = documentai.DocumentProcessorServiceClient()
+    location_path = docai_client.common_location_path(
+        project=project_id, location=location
+    )
+    # Ask for a single finished batch-processing operation in this location.
+    list_request = ListOperationsRequest(
+        name=f"{location_path}/operations",
+        filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE",
+        page_size=1,
+    )
+    first_operation = docai_client.list_operations(request=list_request).operations[0]
+
+    quickstart_sample.quickstart_sample(batch_process_operation=first_operation.name)
+    printed = capsys.readouterr().out
+    assert "Document Successfully Loaded!" in printed
+
+
+def test_quickstart_sample_no_input() -> None:
+    with pytest.raises(ValueError, match=r"No document source provided\."):  # regex
+        quickstart_sample.quickstart_sample()
diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py
@@ -317,6 +317,16 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai):
         document._get_batch_process_metadata(operation_name)
 
 
+def test_get_batch_process_metadata_with_invalid_operation_name():
+    """Reject an operation name whose project segment is empty."""
+    # Nothing sits between `projects/` and `/locations` in this name.
+    malformed_name = "projects//locations/us/operations/7890123"
+
+    with pytest.raises(ValueError, match="Invalid Operation Name"):
+        # The name is validated before any client is created or request is sent.
+        document._get_batch_process_metadata(malformed_name)
+
+
 def test_bigquery_column_name():
     string_map = {
         "Phone #:": "phone_num",