From 1e871f690dc5abfde748d7aef52001f8c406304f Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Thu, 13 Jun 2024 10:28:23 -0500 Subject: [PATCH 01/13] minor cleanup in page.py - Fix Docstrings, remove extra unused property --- google/cloud/documentai_toolbox/wrappers/page.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index d224d1de..888fef2c 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -186,7 +186,7 @@ class Line(_BasePageElement): Required. The original object. text (str): Required. The text of the Line. - _tokens (List[Token]): + tokens (List[Token]): Optional. The Tokens contained within the Line. """ @@ -207,14 +207,10 @@ class Paragraph(_BasePageElement): Required. The original object. text (str): Required. The text of the Paragraph. - _lines (List[Line]): + lines (List[Line]): Optional. The Lines contained within the Paragraph. """ - _lines: Optional[List[Line]] = dataclasses.field( - init=False, repr=False, default=None - ) - @cached_property def lines(self): return cast( From 46fcb37b868ed7d4480dde5f2958be7cb97ed363 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Thu, 13 Jun 2024 11:51:35 -0500 Subject: [PATCH 02/13] page.py Class restructuring for clarity --- .../cloud/documentai_toolbox/wrappers/page.py | 166 +++++++++--------- 1 file changed, 84 insertions(+), 82 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 888fef2c..dc4dbd3f 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -18,7 +18,7 @@ from abc import ABC import dataclasses from functools import cached_property -from typing import List, Optional, Type, cast +from typing import Iterable, List, Optional, Type, cast import pandas as pd @@ -44,16 +44,16 @@ class Table: _page: "Page" = dataclasses.field(repr=False) @cached_property - def body_rows(self): - return _table_rows_from_documentai_table_rows( - table_rows=list(self.documentai_object.body_rows), + def body_rows(self) -> List[List[str]]: + return Table._extract_table_rows( + table_rows=self.documentai_object.body_rows, text=self._page._document_text, ) @cached_property - def header_rows(self): - return _table_rows_from_documentai_table_rows( - table_rows=list(self.documentai_object.header_rows), + def header_rows(self) -> List[List[str]]: + return Table._extract_table_rows( + table_rows=self.documentai_object.header_rows, text=self._page._document_text, ) @@ -75,6 +75,31 @@ def to_dataframe(self) -> pd.DataFrame: return pd.DataFrame(self.body_rows, columns=columns) + @staticmethod + def _extract_table_rows( + table_rows: Iterable[documentai.Document.Page.Table.TableRow], text: str + ) -> List[List[str]]: + r"""Returns a list of rows from table_rows. + + Args: + table_rows (List[documentai.Document.Page.Table.TableRow]): + Required. A documentai.Document.Page.Table.TableRow. + text (str): + Required. UTF-8 encoded text in reading order + from the document. + + Returns: + List[List[str]]: + A list of table rows. + """ + return [ + [ + _text_from_layout(cell.layout, text).replace("\n", "") + for cell in row.cells + ] + for row in table_rows + ] + @dataclasses.dataclass class FormField: @@ -95,21 +120,35 @@ class FormField: _page: "Page" = dataclasses.field(repr=False) @cached_property - def field_name(self): - return _trim_text( + def field_name(self) -> str: + return FormField._trim_text( _text_from_layout( self.documentai_object.field_name, self._page._document_text ) ) @cached_property - def field_value(self): - return _trim_text( + def field_value(self) -> str: + return FormField._trim_text( _text_from_layout( self.documentai_object.field_value, self._page._document_text ) ) + @staticmethod + def _trim_text(text: str) -> str: + r"""Remove extra space characters from text (blank, newline, tab, etc.) + + Args: + text (str): + Required. UTF-8 encoded text in reading order + from the document. + Returns: + str: + Text without trailing spaces/newlines + """ + return text.strip().replace("\n", " ") + @dataclasses.dataclass class _BasePageElement(ABC): @@ -119,7 +158,7 @@ class _BasePageElement(ABC): _page: "Page" = dataclasses.field(repr=False) @cached_property - def text(self): + def text(self) -> str: """ Text of the page element. """ @@ -137,6 +176,35 @@ def hocr_bounding_box(self): page_dimension=self._page.documentai_object.dimension, ) + def _get_children_of_element( + self, children: List[ElementWithLayout] + ) -> List[ElementWithLayout]: + r"""Returns a list of children inside element. + + Args: + children (List[ElementWithLayout]): + Required. List of wrapped children. + + Returns: + List[ElementWithLayout]: + A list of wrapped children that are inside an element. + """ + start_index = self.documentai_object.layout.text_anchor.text_segments[ + 0 + ].start_index + end_index = self.documentai_object.layout.text_anchor.text_segments[0].end_index + + return [ + child + for child in children + if start_index + <= child.documentai_object.layout.text_anchor.text_segments[0].start_index + < end_index + and start_index + < child.documentai_object.layout.text_anchor.text_segments[0].end_index + <= end_index + ] + @dataclasses.dataclass class Symbol(_BasePageElement): @@ -173,7 +241,7 @@ class Token(_BasePageElement): def symbols(self): return cast( List[Symbol], - _get_children_of_element(self.documentai_object, self._page.symbols), + self._get_children_of_element(self._page.symbols), ) @@ -194,7 +262,7 @@ class Line(_BasePageElement): def tokens(self): return cast( List[Token], - _get_children_of_element(self.documentai_object, self._page.tokens), + self._get_children_of_element(self._page.tokens), ) @@ -215,7 +283,7 @@ class Paragraph(_BasePageElement): def lines(self): return cast( List[Line], - _get_children_of_element(self.documentai_object, self._page.lines), + self._get_children_of_element(self._page.lines), ) @@ -236,7 +304,7 @@ class Block(_BasePageElement): def paragraphs(self): return cast( List[Paragraph], - _get_children_of_element(self.documentai_object, self._page.paragraphs), + self._get_children_of_element(self._page.paragraphs), ) @@ -258,28 +326,6 @@ def hocr_bounding_box(self): return None -def _table_rows_from_documentai_table_rows( - table_rows: List[documentai.Document.Page.Table.TableRow], text: str -) -> List[List[str]]: - r"""Returns a list of rows from table_rows. - - Args: - table_rows (List[documentai.Document.Page.Table.TableRow]): - Required. A documentai.Document.Page.Table.TableRow. - text (str): - Required. UTF-8 encoded text in reading order - from the document. - - Returns: - List[List[str]]: - A list of table rows. - """ - return [ - [_text_from_layout(cell.layout, text).replace("\n", "") for cell in row.cells] - for row in table_rows - ] - - def _get_hocr_bounding_box( element_with_layout: ElementWithLayout, page_dimension: documentai.Document.Page.Dimension, @@ -334,50 +380,6 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str ) -def _get_children_of_element( - element: ElementWithLayout, children: List[ElementWithLayout] -) -> List[ElementWithLayout]: - r"""Returns a list of children inside element. - - Args: - element (ElementWithLayout): - Required. A element in a page. - children (List[ElementWithLayout]): - Required. List of wrapped children. - - Returns: - List[ElementWithLayout]: - A list of wrapped children that are inside a element. - """ - start_index = element.layout.text_anchor.text_segments[0].start_index - end_index = element.layout.text_anchor.text_segments[0].end_index - - return [ - child - for child in children - if start_index - <= child.documentai_object.layout.text_anchor.text_segments[0].start_index - < end_index - and start_index - < child.documentai_object.layout.text_anchor.text_segments[0].end_index - <= end_index - ] - - -def _trim_text(text: str) -> str: - r"""Remove extra space characters from text (blank, newline, tab, etc.) - - Args: - text (str): - Required. UTF-8 encoded text in reading order - from the document. - Returns: - str: - Text without trailing spaces/newlines - """ - return text.strip().replace("\n", " ") - - @dataclasses.dataclass class Page: """Represents a wrapped documentai.Document.Page . From 867b24e40dd0df65a621fbcbd49e0e75c8e2faa6 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Thu, 13 Jun 2024 12:33:20 -0500 Subject: [PATCH 03/13] Further refactoring --- .../cloud/documentai_toolbox/wrappers/page.py | 129 ++++++++---------- 1 file changed, 55 insertions(+), 74 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index dc4dbd3f..26f227c2 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -18,7 +18,7 @@ from abc import ABC import dataclasses from functools import cached_property -from typing import Iterable, List, Optional, Type, cast +from typing import Iterable, List, Optional, Type import pandas as pd @@ -45,48 +45,38 @@ class Table: @cached_property def body_rows(self) -> List[List[str]]: - return Table._extract_table_rows( - table_rows=self.documentai_object.body_rows, - text=self._page._document_text, - ) + return self._extract_table_rows(self.documentai_object.body_rows) @cached_property def header_rows(self) -> List[List[str]]: - return Table._extract_table_rows( - table_rows=self.documentai_object.header_rows, - text=self._page._document_text, - ) + return self._extract_table_rows(self.documentai_object.header_rows) def to_dataframe(self) -> pd.DataFrame: - r"""Returns pd.DataFrame from documentai.table + """Returns pd.DataFrame from documentai.table Returns: pd.DataFrame: The DataFrame of the table. - """ if not self.body_rows: return pd.DataFrame(columns=self.header_rows) - if self.header_rows: - columns = pd.MultiIndex.from_arrays(self.header_rows) - else: - columns = [None] * len(self.body_rows[0]) + columns = ( + pd.MultiIndex.from_arrays(self.header_rows) + if self.header_rows + else [None] * len(self.body_rows[0]) + ) return pd.DataFrame(self.body_rows, columns=columns) - @staticmethod def _extract_table_rows( - table_rows: Iterable[documentai.Document.Page.Table.TableRow], text: str + self, table_rows: Iterable[documentai.Document.Page.Table.TableRow] ) -> List[List[str]]: - r"""Returns a list of rows from table_rows. + """Returns a list of rows from table_rows. Args: table_rows (List[documentai.Document.Page.Table.TableRow]): Required. A documentai.Document.Page.Table.TableRow. - text (str): - Required. UTF-8 encoded text in reading order - from the document. Returns: List[List[str]]: @@ -94,7 +84,9 @@ def _extract_table_rows( """ return [ [ - _text_from_layout(cell.layout, text).replace("\n", "") + _text_from_layout(cell.layout, self._page._document_text).replace( + "\n", "" + ) for cell in row.cells ] for row in table_rows @@ -121,28 +113,29 @@ class FormField: @cached_property def field_name(self) -> str: - return FormField._trim_text( + return self._trim_text( _text_from_layout( - self.documentai_object.field_name, self._page._document_text + self.documentai_object.field_name.layout, self._page._document_text ) ) @cached_property def field_value(self) -> str: - return FormField._trim_text( + return self._trim_text( _text_from_layout( - self.documentai_object.field_value, self._page._document_text + self.documentai_object.field_value.layout, self._page._document_text ) ) @staticmethod def _trim_text(text: str) -> str: - r"""Remove extra space characters from text (blank, newline, tab, etc.) + """Remove extra space characters from text (blank, newline, tab, etc.) Args: text (str): Required. UTF-8 encoded text in reading order from the document. + Returns: str: Text without trailing spaces/newlines @@ -163,46 +156,47 @@ def text(self) -> str: Text of the page element. """ return _text_from_layout( - layout=self.documentai_object.layout, text=self._page._document_text + self.documentai_object.layout, self._page._document_text ) @cached_property - def hocr_bounding_box(self): + def hocr_bounding_box(self) -> Optional[str]: """ hOCR bounding box of the page element. """ return _get_hocr_bounding_box( - element_with_layout=self.documentai_object, - page_dimension=self._page.documentai_object.dimension, + self.documentai_object, self._page.documentai_object.dimension ) + @cached_property + def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment: + """ + Element text section + """ + return self.documentai_object.layout.text_anchor.text_segments[0] + def _get_children_of_element( - self, children: List[ElementWithLayout] - ) -> List[ElementWithLayout]: - r"""Returns a list of children inside element. + self, potential_children: List["_BasePageElement"] + ) -> List["_BasePageElement"]: + """Returns a list of children inside element. Args: - children (List[ElementWithLayout]): + potential_children (List[_BasePageElement]): Required. List of wrapped children. Returns: - List[ElementWithLayout]: + List[_BasePageElement]: A list of wrapped children that are inside an element. """ - start_index = self.documentai_object.layout.text_anchor.text_segments[ - 0 - ].start_index - end_index = self.documentai_object.layout.text_anchor.text_segments[0].end_index - return [ child - for child in children - if start_index - <= child.documentai_object.layout.text_anchor.text_segments[0].start_index - < end_index - and start_index - < child.documentai_object.layout.text_anchor.text_segments[0].end_index - <= end_index + for child in potential_children + if self._text_segment.start_index + <= child._text_segment.start_index + < self._text_segment.end_index + and self._text_segment.start_index + < child._text_segment.end_index + <= self._text_segment.end_index ] @@ -219,7 +213,7 @@ class Symbol(_BasePageElement): """ @cached_property - def hocr_bounding_box(self): + def hocr_bounding_box(self) -> Optional[str]: # Symbols are not represented in hOCR return None @@ -238,11 +232,8 @@ class Token(_BasePageElement): """ @cached_property - def symbols(self): - return cast( - List[Symbol], - self._get_children_of_element(self._page.symbols), - ) + def symbols(self) -> List[Symbol]: + return self._get_children_of_element(self._page.symbols) @dataclasses.dataclass @@ -259,11 +250,8 @@ class Line(_BasePageElement): """ @cached_property - def tokens(self): - return cast( - List[Token], - self._get_children_of_element(self._page.tokens), - ) + def tokens(self) -> List[Token]: + return self._get_children_of_element(self._page.tokens) @dataclasses.dataclass @@ -280,11 +268,8 @@ class Paragraph(_BasePageElement): """ @cached_property - def lines(self): - return cast( - List[Line], - self._get_children_of_element(self._page.lines), - ) + def lines(self) -> List[Line]: + return self._get_children_of_element(self._page.lines) @dataclasses.dataclass @@ -296,16 +281,13 @@ class Block(_BasePageElement): Required. The original object. text (str): Required. The text of the Block. - _paragraphs (List[Paragraph]): + paragraphs (List[Paragraph]): Optional. The Paragraphs contained within the Block. """ @cached_property - def paragraphs(self): - return cast( - List[Paragraph], - self._get_children_of_element(self._page.paragraphs), - ) + def paragraphs(self) -> List[Paragraph]: + return self._get_children_of_element(self._page.paragraphs) @dataclasses.dataclass @@ -330,7 +312,7 @@ def _get_hocr_bounding_box( element_with_layout: ElementWithLayout, page_dimension: documentai.Document.Page.Dimension, ) -> Optional[str]: - r"""Returns a hOCR bounding box string. + """Returns a hOCR bounding box string. Args: element_with_layout (ElementWithLayout): @@ -340,7 +322,7 @@ def _get_hocr_bounding_box( Returns: Optional[str]: - hOCR bounding box sring. + hOCR bounding box string. """ if not element_with_layout.layout.bounding_poly: return None @@ -483,6 +465,5 @@ def blocks(self): @cached_property def hocr_bounding_box(self): return _get_hocr_bounding_box( - element_with_layout=self.documentai_object, - page_dimension=self.documentai_object.dimension, + self.documentai_object, self.documentai_object.dimension ) From 3879dfaf96c364835acf36d3403cda59ab966461 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Thu, 13 Jun 2024 12:46:21 -0500 Subject: [PATCH 04/13] Fix Typo in FormField class --- google/cloud/documentai_toolbox/wrappers/page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 26f227c2..92e5c519 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -115,7 +115,7 @@ class FormField: def field_name(self) -> str: return self._trim_text( _text_from_layout( - self.documentai_object.field_name.layout, self._page._document_text + self.documentai_object.field_name, self._page._document_text ) ) @@ -123,7 +123,7 @@ def field_name(self) -> str: def field_value(self) -> str: return self._trim_text( _text_from_layout( - self.documentai_object.field_value.layout, self._page._document_text + self.documentai_object.field_value, self._page._document_text ) ) From ea76362a66ce861da6b6f8ee3c119773af6f0614 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Thu, 13 Jun 2024 12:48:31 -0500 Subject: [PATCH 05/13] Improve docstring for text_segment --- google/cloud/documentai_toolbox/wrappers/page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 92e5c519..4664a049 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -171,7 +171,7 @@ def hocr_bounding_box(self) -> Optional[str]: @cached_property def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment: """ - Element text section + Page element text segment. """ return self.documentai_object.layout.text_anchor.text_segments[0] From 03f7621f513ca780a3f41680c3f7c155516f36c0 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Thu, 13 Jun 2024 12:50:48 -0500 Subject: [PATCH 06/13] Move `_trim_text` back --- .../cloud/documentai_toolbox/wrappers/page.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 4664a049..69203809 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -113,7 +113,7 @@ class FormField: @cached_property def field_name(self) -> str: - return self._trim_text( + return _trim_text( _text_from_layout( self.documentai_object.field_name, self._page._document_text ) @@ -121,26 +121,26 @@ def field_name(self) -> str: @cached_property def field_value(self) -> str: - return self._trim_text( + return _trim_text( _text_from_layout( self.documentai_object.field_value, self._page._document_text ) ) - @staticmethod - def _trim_text(text: str) -> str: - """Remove extra space characters from text (blank, newline, tab, etc.) - Args: - text (str): - Required. UTF-8 encoded text in reading order - from the document. +def _trim_text(text: str) -> str: + """Remove extra space characters from text (blank, newline, tab, etc.) - Returns: - str: - Text without trailing spaces/newlines - """ - return text.strip().replace("\n", " ") + Args: + text (str): + Required. UTF-8 encoded text in reading order + from the document. + + Returns: + str: + Text without trailing spaces/newlines + """ + return text.strip().replace("\n", " ") @dataclasses.dataclass From 322612e241ee74bbd6d266aa42bef6fd7dbb7404 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Thu, 13 Jun 2024 13:20:40 -0500 Subject: [PATCH 07/13] Add local variables and stop early to improve efficiency further --- .../cloud/documentai_toolbox/wrappers/page.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 69203809..64f22316 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -188,16 +188,22 @@ def _get_children_of_element( List[_BasePageElement]: A list of wrapped children that are inside an element. """ - return [ - child - for child in potential_children - if self._text_segment.start_index - <= child._text_segment.start_index - < self._text_segment.end_index - and self._text_segment.start_index - < child._text_segment.end_index - <= self._text_segment.end_index - ] + start_index = self._text_segment.start_index + end_index = self._text_segment.end_index + + children = [] + for child in potential_children: + child_start_index = child._text_segment.start_index + child_end_index = child._text_segment.end_index + + if child_start_index >= end_index: + break + if ( + start_index <= child_start_index < end_index + and start_index < child_end_index <= end_index + ): + children.append(child) + return children @dataclasses.dataclass @@ -344,7 +350,7 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str Args: layout (documentai.Document.Page.Layout): - Required. an element with layout fields. + Required. An element with layout fields. text (str): Required. UTF-8 encoded text in reading order of the `documentai.Document` containing the layout element. @@ -353,6 +359,8 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str str: Text from a single element. """ + if not layout.text_anchor or not layout.text_anchor.text_segments: + return "" # Note: `layout.text_anchor.text_segments` are indexes into the full Document text. # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#textsegment From 2e39a0a3f05bf52c1022e8ebf7985cdd2eec1b2f Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 1 Jul 2024 12:03:57 -0500 Subject: [PATCH 08/13] Updated Docstring for `_get_children_of_element` --- .../cloud/documentai_toolbox/wrappers/page.py | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 64f22316..eef355d4 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -178,15 +178,37 @@ def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment: def _get_children_of_element( self, potential_children: List["_BasePageElement"] ) -> List["_BasePageElement"]: - """Returns a list of children inside element. + """ + Filters potential child elements to identify only those fully contained within this element. + + This method iterates through a list of potential child elements, checking if their + start and end indices fall completely within the start and end indices of this element. + Elements that are only partially contained or entirely outside this element's range are excluded. Args: potential_children (List[_BasePageElement]): - Required. List of wrapped children. + Required. A list of wrapped page elements (e.g., words, lines, paragraphs) + that could potentially be children of this element. Returns: List[_BasePageElement]: - A list of wrapped children that are inside an element. + A new list containing only the wrapped page elements that are fully + contained within this element, maintaining their original order. + + Raises: + TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`. + + Example: + ``` + page_element = PageElement(text_segment=TextSegment(0, 100)) + potential_children = [ + PageElement(text_segment=TextSegment(10, 20)), # Inside + PageElement(text_segment=TextSegment(5, 105)), # Overlapping + PageElement(text_segment=TextSegment(120, 150)) # Outside + ] + children = page_element._get_children_of_element(potential_children) + # children will contain only the first PageElement + ``` """ start_index = self._text_segment.start_index end_index = self._text_segment.end_index @@ -197,7 +219,7 @@ def _get_children_of_element( child_end_index = child._text_segment.end_index if child_start_index >= end_index: - break + break # Optimization: stop early if child is beyond the end of this element if ( start_index <= child_start_index < end_index and start_index < child_end_index <= end_index From e0dd0bee2a6dd79383008258e2e04de045ebb5de Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 1 Jul 2024 12:08:07 -0500 Subject: [PATCH 09/13] Added code comment for reason of cached_property --- google/cloud/documentai_toolbox/wrappers/page.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index eef355d4..5e985a53 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -168,6 +168,8 @@ def hocr_bounding_box(self) -> Optional[str]: self.documentai_object, self._page.documentai_object.dimension ) + # This field is a cached property to improve export times for hOCR + # as outlined in https://github.com/googleapis/python-documentai-toolbox/issues/312 @cached_property def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment: """ From ba33c99ad3b74744ac108251781234a1ad91ae62 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 1 Jul 2024 12:11:24 -0500 Subject: [PATCH 10/13] Attempt to fix docstring formatting --- google/cloud/documentai_toolbox/wrappers/page.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 5e985a53..add192f9 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -201,6 +201,7 @@ def _get_children_of_element( TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`. Example: + ``` page_element = PageElement(text_segment=TextSegment(0, 100)) potential_children = [ @@ -211,6 +212,7 @@ def _get_children_of_element( children = page_element._get_children_of_element(potential_children) # children will contain only the first PageElement ``` + """ start_index = self._text_segment.start_index end_index = self._text_segment.end_index From 46aefcc46e17c852e539fb42e33cdc7d2afcabb1 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 1 Jul 2024 12:23:44 -0500 Subject: [PATCH 11/13] Remove example code from `_get_children_of_element` docstring --- .../cloud/documentai_toolbox/wrappers/page.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index add192f9..144c8d31 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -201,18 +201,16 @@ def _get_children_of_element( TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`. Example: - - ``` - page_element = PageElement(text_segment=TextSegment(0, 100)) - potential_children = [ - PageElement(text_segment=TextSegment(10, 20)), # Inside - PageElement(text_segment=TextSegment(5, 105)), # Overlapping - PageElement(text_segment=TextSegment(120, 150)) # Outside - ] - children = page_element._get_children_of_element(potential_children) - # children will contain only the first PageElement - ``` - + ``` + page_element = PageElement(text_segment=TextSegment(0, 100)) + potential_children = [ + PageElement(text_segment=TextSegment(10, 20)), # Inside + PageElement(text_segment=TextSegment(5, 105)), # Overlapping + PageElement(text_segment=TextSegment(120, 150)) # Outside + ] + children = page_element._get_children_of_element(potential_children) + # children will contain only the first PageElement + ``` """ start_index = self._text_segment.start_index end_index = self._text_segment.end_index From aeebb80d1ee3952d96f54fc48ac755e157072b07 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 1 Jul 2024 12:26:59 -0500 Subject: [PATCH 12/13] Remove Raises from docstring --- google/cloud/documentai_toolbox/wrappers/page.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 144c8d31..046e1411 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -196,21 +196,6 @@ def _get_children_of_element( List[_BasePageElement]: A new list containing only the wrapped page elements that are fully contained within this element, maintaining their original order. - - Raises: - TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`. - - Example: - ``` - page_element = PageElement(text_segment=TextSegment(0, 100)) - potential_children = [ - PageElement(text_segment=TextSegment(10, 20)), # Inside - PageElement(text_segment=TextSegment(5, 105)), # Overlapping - PageElement(text_segment=TextSegment(120, 150)) # Outside - ] - children = page_element._get_children_of_element(potential_children) - # children will contain only the first PageElement - ``` """ start_index = self._text_segment.start_index end_index = self._text_segment.end_index From 0faba3c26e7cf6f389ee1892b6e8de24ed0792e9 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Tue, 2 Jul 2024 15:24:09 -0500 Subject: [PATCH 13/13] Added code comments for Review comments --- google/cloud/documentai_toolbox/wrappers/page.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index 046e1411..35a2491e 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -84,6 +84,7 @@ def _extract_table_rows( """ return [ [ + # Newlines removed to improve formatting for export formats. _text_from_layout(cell.layout, self._page._document_text).replace( "\n", "" ) @@ -140,6 +141,8 @@ def _trim_text(text: str) -> str: str: Text without trailing spaces/newlines """ + # Newline replacement added to correct common + # misshapen output from Form Parser. return text.strip().replace("\n", " ")