Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion google/cloud/documentai_toolbox/utilities/gcs_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import re
from typing import Dict, List, Optional, Tuple

from google.api_core import client_info
from google.api_core.gapic_v1 import client_info

from google.cloud import documentai, documentai_toolbox, storage
from google.cloud.documentai_toolbox import constants
Expand Down
24 changes: 23 additions & 1 deletion google/cloud/documentai_toolbox/wrappers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import re
from typing import Dict, List, Optional, Type, Union

from google.api_core.client_options import ClientOptions
from google.api_core.operation import from_gapic as operation_from_gapic
from google.cloud.vision import AnnotateFileResponse
from google.longrunning.operations_pb2 import GetOperationRequest
Expand Down Expand Up @@ -138,6 +139,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume

def _get_batch_process_metadata(
operation_name: str,
location: Optional[str] = None,
timeout: Optional[float] = None,
) -> documentai.BatchProcessMetadata:
r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation.
Expand All @@ -146,22 +148,41 @@ def _get_batch_process_metadata(
operation_name (str):
Required. The fully qualified operation name for a `batch_process_documents()` operation.

location (str):
Optional. The location of the processor used for `batch_process_documents()`.
Deprecated. Maintained for backwards compatibility.

timeout (float):
Optional. Default None. Time in seconds to wait for operation to complete.
If None, will wait indefinitely.
Returns:
documentai.BatchProcessMetadata:
Metadata from batch process.
"""
# Validate Operation Name
match = re.search(
r"projects\/\w+\/locations\/(\w+)\/operations\/\w+", operation_name
)

if not match:
raise ValueError(
f"Invalid Operation Name: {operation_name}\n"
"Expected operation name in the format `projects/<project>/locations/<location>/operations/<operation>`"
)

location = location or match.group(1)

client = documentai.DocumentProcessorServiceClient(
client_info=gcs_utilities._get_client_info(module="get_batch_process_metadata"),
client_options=ClientOptions(
api_endpoint=f"{location}-documentai.googleapis.com"
),
)

# Poll Operation until complete.
operation = operation_from_gapic(
operation=client.get_operation(
request=GetOperationRequest(name=operation_name),
metadata=documentai.BatchProcessMetadata(),
),
operations_client=client,
result_type=documentai.BatchProcessResponse,
Expand Down Expand Up @@ -599,6 +620,7 @@ def from_batch_process_operation(
return cls.from_batch_process_metadata(
metadata=_get_batch_process_metadata(
operation_name=operation_name,
location=location,
timeout=timeout,
)
)
Expand Down
85 changes: 66 additions & 19 deletions samples/snippets/quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,41 +15,88 @@


# [START documentai_toolbox_quickstart]
from typing import Optional

from google.cloud import documentai
from google.cloud.documentai_toolbox import document
from google.cloud.documentai_toolbox import gcs_utilities

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"

# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json
# gcs_uri = "gs://bucket/path/to/folder/document.json"

# Or, given a Document JSON in path local/path/to/folder/document.json
# document_path = "local/path/to/folder/document.json"

# Or, given a Document object from Document AI
# documentai_document = documentai.Document()

# Or, given a BatchProcessMetadata object from Document AI
# operation = client.batch_process_documents(request)
# operation.result(timeout=timeout)
# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata)

# Or, given the name of a `batch_process_documents()` long-running operation from Document AI
# batch_process_operation = "projects/project_id/locations/location/operations/operation_id"


def quickstart_sample(
gcs_bucket_name: Optional[str] = None,
gcs_prefix: Optional[str] = None,
gcs_uri: Optional[str] = None,
document_path: Optional[str] = None,
documentai_document: Optional[documentai.Document] = None,
batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
batch_process_operation: Optional[str] = None,
) -> None:
if gcs_bucket_name and gcs_prefix:
# Load from Google Cloud Storage Directory
print("Document structure in Cloud Storage")
gcs_utilities.print_gcs_document_tree(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)

wrapped_document = document.Document.from_gcs(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)
elif gcs_uri:
# Load a single Document from a Google Cloud Storage URI
wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
elif document_path:
# Load from local `Document` JSON file
wrapped_document = document.Document.from_document_path(document_path)
elif documentai_document:
# Load from `documentai.Document` object
wrapped_document = document.Document.from_documentai_document(
documentai_document
)
elif batch_process_metadata:
# Load Documents from `BatchProcessMetadata` object
wrapped_documents = document.Document.from_batch_process_metadata(
metadata=batch_process_metadata
)
wrapped_document = wrapped_documents[0]
elif batch_process_operation:
wrapped_documents = document.Document.from_batch_process_operation(
location="us", operation_name=batch_process_operation
)
wrapped_document = wrapped_documents[0]
else:
raise ValueError("No document source provided.")

def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:
print("Document structure in Cloud Storage")
gcs_utilities.print_gcs_document_tree(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)

wrapped_document = document.Document.from_gcs(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)
# For all properties and methods, refer to:
# https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document

# Alternatively, create wrapped document from:
#
# - Local `Document` JSON file: `document.Document.from_document_path()`
# - `Document` object: `document.Document.from_documentai_document()`
# - `BatchProcessMetadata`: `document.Document.from_batch_process_metadata()`
# - Batch Processing Operation: `document.Document.from_batch_process_operation()`

print("Document Successfully Loaded!")
print(f"\t Number of Pages: {len(wrapped_document.pages)}")
print(f"\t Number of Entities: {len(wrapped_document.entities)}")

for idx, page in enumerate(wrapped_document.pages):
print(f"Page {idx}")
for page in wrapped_document.pages:
print(f"Page {page.page_number}")
for block in page.blocks:
print(block.text)
for paragraph in page.paragraphs:
Expand Down
95 changes: 91 additions & 4 deletions samples/snippets/test_quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,105 @@
import pytest
from samples.snippets import quickstart_sample

from google.cloud import documentai
from google.longrunning.operations_pb2 import ListOperationsRequest # type: ignore

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
gcs_bucket_name = "documentai_toolbox_samples"
gcs_input_uri = "output/123456789/0"


def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None:
def test_quickstart_sample_gcs_bucket_prefix(capsys: pytest.CaptureFixture) -> None:
gcs_bucket_name = "documentai_toolbox_samples"
gcs_prefix = "output/123456789/0"
quickstart_sample.quickstart_sample(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)
out, _ = capsys.readouterr()

assert "Document structure in Cloud Storage" in out
assert "Number of Pages: 1" in out
assert "Number of Entities: 35" in out


def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None:
    """Run the quickstart with a single `gs://` Document JSON URI and check the printed counts."""
    quickstart_sample.quickstart_sample(
        gcs_uri="gs://documentai_toolbox_samples/output/123456789/0/toolbox_invoice_test-0.json"
    )
    stdout = capsys.readouterr().out

    # Both summary lines printed by the sample must appear in captured stdout.
    for expected in ("Number of Pages: 1", "Number of Entities: 35"):
        assert expected in stdout


def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None:
    """Run the quickstart with a local Document JSON path and check the printed output."""
    quickstart_sample.quickstart_sample(
        document_path="resources/form_with_tables.json"
    )
    stdout = capsys.readouterr().out

    # Page/entity counts plus one piece of text expected in the printed output.
    for expected in ("Number of Pages: 1", "Number of Entities: 0", "Form Date"):
        assert expected in stdout


def test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) -> None:
    """Run the quickstart with an in-memory `documentai.Document` and check the printed output."""
    # Build the Document object from the local JSON fixture first.
    with open("resources/form_with_tables.json", encoding="utf-8") as json_file:
        raw_json = json_file.read()
    parsed_document = documentai.Document.from_json(
        raw_json, ignore_unknown_fields=True
    )

    quickstart_sample.quickstart_sample(documentai_document=parsed_document)
    stdout = capsys.readouterr().out

    for expected in ("Number of Pages: 1", "Number of Entities: 0", "Form Date"):
        assert expected in stdout


def _get_completed_batch_process_operation():
    """Return one completed batch-process operation listed for the test project.

    Lists operations under `project_id` / `location` filtered to
    `TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE` and returns the first one.
    Skips the calling test when the listing is empty, so a missing
    precondition is reported explicitly rather than as an `IndexError`.
    """
    client = documentai.DocumentProcessorServiceClient()
    parent = client.common_location_path(project=project_id, location=location)
    response = client.list_operations(
        request=ListOperationsRequest(
            name=f"{parent}/operations",
            filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE",
            page_size=1,
        )
    )

    if not response.operations:
        pytest.skip(
            f"No completed batch process operations found under {parent}/operations"
        )

    return response.operations[0]


def test_quickstart_sample_batch_process_metadata(
    capsys: pytest.CaptureFixture,
) -> None:
    """Run the quickstart with `BatchProcessMetadata` taken from a completed operation."""
    operation = _get_completed_batch_process_operation()
    # The operation's metadata is a packed message; decode its raw bytes.
    batch_process_metadata = documentai.BatchProcessMetadata.deserialize(
        operation.metadata.value
    )

    quickstart_sample.quickstart_sample(batch_process_metadata=batch_process_metadata)

    out, _ = capsys.readouterr()

    assert "Document Successfully Loaded!" in out


def test_quickstart_sample_batch_process_operation(
    capsys: pytest.CaptureFixture,
) -> None:
    """Run the quickstart with the name of a completed batch-process operation."""
    batch_process_operation = _get_completed_batch_process_operation().name

    quickstart_sample.quickstart_sample(batch_process_operation=batch_process_operation)

    out, _ = capsys.readouterr()

    assert "Document Successfully Loaded!" in out


def test_quickstart_sample_no_input() -> None:
    """Calling the quickstart with no document source must raise `ValueError`."""
    # `match` is a regular expression searched against the exception text, so
    # the trailing period is escaped to require the literal message.
    with pytest.raises(ValueError, match=r"No document source provided\."):
        quickstart_sample.quickstart_sample()
10 changes: 10 additions & 0 deletions tests/unit/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,16 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai):
document._get_batch_process_metadata(operation_name)


def test_get_batch_process_metadata_with_invalid_operation_name():
    """An operation name with an empty project segment must raise `ValueError`."""
    malformed_operation_name = "projects//locations/us/operations/7890123"

    with pytest.raises(ValueError, match="Invalid Operation Name"):
        document._get_batch_process_metadata(malformed_operation_name)


def test_bigquery_column_name():
string_map = {
"Phone #:": "phone_num",
Expand Down