From 1e871f690dc5abfde748d7aef52001f8c406304f Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Thu, 13 Jun 2024 10:28:23 -0500
Subject: [PATCH 01/13] minor cleanup in page.py

- Fix docstrings; remove the unused `_lines` dataclass field from `Paragraph`
---
 google/cloud/documentai_toolbox/wrappers/page.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index d224d1de..888fef2c 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -186,7 +186,7 @@ class Line(_BasePageElement):
             Required. The original object.
         text (str):
             Required. The text of the Line.
-        _tokens (List[Token]):
+        tokens (List[Token]):
             Optional. The Tokens contained within the Line.
     """
 
@@ -207,14 +207,10 @@ class Paragraph(_BasePageElement):
             Required. The original object.
         text (str):
             Required. The text of the Paragraph.
-        _lines (List[Line]):
+        lines (List[Line]):
             Optional. The Lines contained within the Paragraph.
     """
 
-    _lines: Optional[List[Line]] = dataclasses.field(
-        init=False, repr=False, default=None
-    )
-
     @cached_property
     def lines(self):
         return cast(

From 46fcb37b868ed7d4480dde5f2958be7cb97ed363 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Thu, 13 Jun 2024 11:51:35 -0500
Subject: [PATCH 02/13] page.py Class restructuring for clarity

---
 .../cloud/documentai_toolbox/wrappers/page.py | 166 +++++++++---------
 1 file changed, 84 insertions(+), 82 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 888fef2c..dc4dbd3f 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -18,7 +18,7 @@
 from abc import ABC
 import dataclasses
 from functools import cached_property
-from typing import List, Optional, Type, cast
+from typing import Iterable, List, Optional, Type, cast
 
 import pandas as pd
 
@@ -44,16 +44,16 @@ class Table:
     _page: "Page" = dataclasses.field(repr=False)
 
     @cached_property
-    def body_rows(self):
-        return _table_rows_from_documentai_table_rows(
-            table_rows=list(self.documentai_object.body_rows),
+    def body_rows(self) -> List[List[str]]:
+        return Table._extract_table_rows(
+            table_rows=self.documentai_object.body_rows,
             text=self._page._document_text,
         )
 
     @cached_property
-    def header_rows(self):
-        return _table_rows_from_documentai_table_rows(
-            table_rows=list(self.documentai_object.header_rows),
+    def header_rows(self) -> List[List[str]]:
+        return Table._extract_table_rows(
+            table_rows=self.documentai_object.header_rows,
             text=self._page._document_text,
         )
 
@@ -75,6 +75,31 @@ def to_dataframe(self) -> pd.DataFrame:
 
         return pd.DataFrame(self.body_rows, columns=columns)
 
+    @staticmethod
+    def _extract_table_rows(
+        table_rows: Iterable[documentai.Document.Page.Table.TableRow], text: str
+    ) -> List[List[str]]:
+        r"""Returns a list of rows from table_rows.
+
+        Args:
+            table_rows (List[documentai.Document.Page.Table.TableRow]):
+                Required. A documentai.Document.Page.Table.TableRow.
+            text (str):
+                Required. UTF-8 encoded text in reading order
+                from the document.
+
+        Returns:
+            List[List[str]]:
+                A list of table rows.
+        """
+        return [
+            [
+                _text_from_layout(cell.layout, text).replace("\n", "")
+                for cell in row.cells
+            ]
+            for row in table_rows
+        ]
+
 
 @dataclasses.dataclass
 class FormField:
@@ -95,21 +120,35 @@ class FormField:
     _page: "Page" = dataclasses.field(repr=False)
 
     @cached_property
-    def field_name(self):
-        return _trim_text(
+    def field_name(self) -> str:
+        return FormField._trim_text(
             _text_from_layout(
                 self.documentai_object.field_name, self._page._document_text
             )
         )
 
     @cached_property
-    def field_value(self):
-        return _trim_text(
+    def field_value(self) -> str:
+        return FormField._trim_text(
             _text_from_layout(
                 self.documentai_object.field_value, self._page._document_text
             )
         )
 
+    @staticmethod
+    def _trim_text(text: str) -> str:
+        r"""Remove extra space characters from text (blank, newline, tab, etc.)
+
+        Args:
+            text (str):
+                Required. UTF-8 encoded text in reading order
+                from the document.
+        Returns:
+            str:
+                Text without trailing spaces/newlines
+        """
+        return text.strip().replace("\n", " ")
+
 
 @dataclasses.dataclass
 class _BasePageElement(ABC):
@@ -119,7 +158,7 @@ class _BasePageElement(ABC):
     _page: "Page" = dataclasses.field(repr=False)
 
     @cached_property
-    def text(self):
+    def text(self) -> str:
         """
         Text of the page element.
         """
@@ -137,6 +176,35 @@ def hocr_bounding_box(self):
             page_dimension=self._page.documentai_object.dimension,
         )
 
+    def _get_children_of_element(
+        self, children: List[ElementWithLayout]
+    ) -> List[ElementWithLayout]:
+        r"""Returns a list of children inside element.
+
+        Args:
+            children (List[ElementWithLayout]):
+                Required. List of wrapped children.
+
+        Returns:
+            List[ElementWithLayout]:
+                A list of wrapped children that are inside an element.
+        """
+        start_index = self.documentai_object.layout.text_anchor.text_segments[
+            0
+        ].start_index
+        end_index = self.documentai_object.layout.text_anchor.text_segments[0].end_index
+
+        return [
+            child
+            for child in children
+            if start_index
+            <= child.documentai_object.layout.text_anchor.text_segments[0].start_index
+            < end_index
+            and start_index
+            < child.documentai_object.layout.text_anchor.text_segments[0].end_index
+            <= end_index
+        ]
+
 
 @dataclasses.dataclass
 class Symbol(_BasePageElement):
@@ -173,7 +241,7 @@ class Token(_BasePageElement):
     def symbols(self):
         return cast(
             List[Symbol],
-            _get_children_of_element(self.documentai_object, self._page.symbols),
+            self._get_children_of_element(self._page.symbols),
         )
 
 
@@ -194,7 +262,7 @@ class Line(_BasePageElement):
     def tokens(self):
         return cast(
             List[Token],
-            _get_children_of_element(self.documentai_object, self._page.tokens),
+            self._get_children_of_element(self._page.tokens),
         )
 
 
@@ -215,7 +283,7 @@ class Paragraph(_BasePageElement):
     def lines(self):
         return cast(
             List[Line],
-            _get_children_of_element(self.documentai_object, self._page.lines),
+            self._get_children_of_element(self._page.lines),
         )
 
 
@@ -236,7 +304,7 @@ class Block(_BasePageElement):
     def paragraphs(self):
         return cast(
             List[Paragraph],
-            _get_children_of_element(self.documentai_object, self._page.paragraphs),
+            self._get_children_of_element(self._page.paragraphs),
         )
 
 
@@ -258,28 +326,6 @@ def hocr_bounding_box(self):
         return None
 
 
-def _table_rows_from_documentai_table_rows(
-    table_rows: List[documentai.Document.Page.Table.TableRow], text: str
-) -> List[List[str]]:
-    r"""Returns a list of rows from table_rows.
-
-    Args:
-        table_rows (List[documentai.Document.Page.Table.TableRow]):
-            Required. A documentai.Document.Page.Table.TableRow.
-        text (str):
-            Required. UTF-8 encoded text in reading order
-            from the document.
-
-    Returns:
-        List[List[str]]:
-            A list of table rows.
-    """
-    return [
-        [_text_from_layout(cell.layout, text).replace("\n", "") for cell in row.cells]
-        for row in table_rows
-    ]
-
-
 def _get_hocr_bounding_box(
     element_with_layout: ElementWithLayout,
     page_dimension: documentai.Document.Page.Dimension,
@@ -334,50 +380,6 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
     )
 
 
-def _get_children_of_element(
-    element: ElementWithLayout, children: List[ElementWithLayout]
-) -> List[ElementWithLayout]:
-    r"""Returns a list of children inside element.
-
-    Args:
-        element (ElementWithLayout):
-            Required. A element in a page.
-        children (List[ElementWithLayout]):
-            Required. List of wrapped children.
-
-    Returns:
-        List[ElementWithLayout]:
-            A list of wrapped children that are inside a element.
-    """
-    start_index = element.layout.text_anchor.text_segments[0].start_index
-    end_index = element.layout.text_anchor.text_segments[0].end_index
-
-    return [
-        child
-        for child in children
-        if start_index
-        <= child.documentai_object.layout.text_anchor.text_segments[0].start_index
-        < end_index
-        and start_index
-        < child.documentai_object.layout.text_anchor.text_segments[0].end_index
-        <= end_index
-    ]
-
-
-def _trim_text(text: str) -> str:
-    r"""Remove extra space characters from text (blank, newline, tab, etc.)
-
-    Args:
-        text (str):
-            Required. UTF-8 encoded text in reading order
-            from the document.
-    Returns:
-        str:
-            Text without trailing spaces/newlines
-    """
-    return text.strip().replace("\n", " ")
-
-
 @dataclasses.dataclass
 class Page:
     """Represents a wrapped documentai.Document.Page .

From 867b24e40dd0df65a621fbcbd49e0e75c8e2faa6 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Thu, 13 Jun 2024 12:33:20 -0500
Subject: [PATCH 03/13] Further refactoring

---
 .../cloud/documentai_toolbox/wrappers/page.py | 129 ++++++++----------
 1 file changed, 55 insertions(+), 74 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index dc4dbd3f..26f227c2 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -18,7 +18,7 @@
 from abc import ABC
 import dataclasses
 from functools import cached_property
-from typing import Iterable, List, Optional, Type, cast
+from typing import Iterable, List, Optional, Type
 
 import pandas as pd
 
@@ -45,48 +45,38 @@ class Table:
 
     @cached_property
     def body_rows(self) -> List[List[str]]:
-        return Table._extract_table_rows(
-            table_rows=self.documentai_object.body_rows,
-            text=self._page._document_text,
-        )
+        return self._extract_table_rows(self.documentai_object.body_rows)
 
     @cached_property
     def header_rows(self) -> List[List[str]]:
-        return Table._extract_table_rows(
-            table_rows=self.documentai_object.header_rows,
-            text=self._page._document_text,
-        )
+        return self._extract_table_rows(self.documentai_object.header_rows)
 
     def to_dataframe(self) -> pd.DataFrame:
-        r"""Returns pd.DataFrame from documentai.table
+        """Returns pd.DataFrame from documentai.table
 
         Returns:
             pd.DataFrame:
                 The DataFrame of the table.
-
         """
         if not self.body_rows:
             return pd.DataFrame(columns=self.header_rows)
 
-        if self.header_rows:
-            columns = pd.MultiIndex.from_arrays(self.header_rows)
-        else:
-            columns = [None] * len(self.body_rows[0])
+        columns = (
+            pd.MultiIndex.from_arrays(self.header_rows)
+            if self.header_rows
+            else [None] * len(self.body_rows[0])
+        )
 
         return pd.DataFrame(self.body_rows, columns=columns)
 
-    @staticmethod
     def _extract_table_rows(
-        table_rows: Iterable[documentai.Document.Page.Table.TableRow], text: str
+        self, table_rows: Iterable[documentai.Document.Page.Table.TableRow]
     ) -> List[List[str]]:
-        r"""Returns a list of rows from table_rows.
+        """Returns a list of rows from table_rows.
 
         Args:
             table_rows (List[documentai.Document.Page.Table.TableRow]):
                 Required. A documentai.Document.Page.Table.TableRow.
-            text (str):
-                Required. UTF-8 encoded text in reading order
-                from the document.
 
         Returns:
             List[List[str]]:
@@ -94,7 +84,9 @@ def _extract_table_rows(
         """
         return [
             [
-                _text_from_layout(cell.layout, text).replace("\n", "")
+                _text_from_layout(cell.layout, self._page._document_text).replace(
+                    "\n", ""
+                )
                 for cell in row.cells
             ]
             for row in table_rows
@@ -121,28 +113,29 @@ class FormField:
 
     @cached_property
     def field_name(self) -> str:
-        return FormField._trim_text(
+        return self._trim_text(
             _text_from_layout(
-                self.documentai_object.field_name, self._page._document_text
+                self.documentai_object.field_name.layout, self._page._document_text
             )
         )
 
     @cached_property
     def field_value(self) -> str:
-        return FormField._trim_text(
+        return self._trim_text(
             _text_from_layout(
-                self.documentai_object.field_value, self._page._document_text
+                self.documentai_object.field_value.layout, self._page._document_text
             )
         )
 
     @staticmethod
     def _trim_text(text: str) -> str:
-        r"""Remove extra space characters from text (blank, newline, tab, etc.)
+        """Remove extra space characters from text (blank, newline, tab, etc.)
 
         Args:
             text (str):
                 Required. UTF-8 encoded text in reading order
                 from the document.
+
         Returns:
             str:
                 Text without trailing spaces/newlines
@@ -163,46 +156,47 @@ def text(self) -> str:
         Text of the page element.
         """
         return _text_from_layout(
-            layout=self.documentai_object.layout, text=self._page._document_text
+            self.documentai_object.layout, self._page._document_text
         )
 
     @cached_property
-    def hocr_bounding_box(self):
+    def hocr_bounding_box(self) -> Optional[str]:
         """
         hOCR bounding box of the page element.
         """
         return _get_hocr_bounding_box(
-            element_with_layout=self.documentai_object,
-            page_dimension=self._page.documentai_object.dimension,
+            self.documentai_object, self._page.documentai_object.dimension
         )
 
+    @cached_property
+    def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment:
+        """
+        Element text section
+        """
+        return self.documentai_object.layout.text_anchor.text_segments[0]
+
     def _get_children_of_element(
-        self, children: List[ElementWithLayout]
-    ) -> List[ElementWithLayout]:
-        r"""Returns a list of children inside element.
+        self, potential_children: List["_BasePageElement"]
+    ) -> List["_BasePageElement"]:
+        """Returns a list of children inside element.
 
         Args:
-            children (List[ElementWithLayout]):
+            potential_children (List[_BasePageElement]):
                 Required. List of wrapped children.
 
         Returns:
-            List[ElementWithLayout]:
+            List[_BasePageElement]:
                 A list of wrapped children that are inside an element.
         """
-        start_index = self.documentai_object.layout.text_anchor.text_segments[
-            0
-        ].start_index
-        end_index = self.documentai_object.layout.text_anchor.text_segments[0].end_index
-
         return [
             child
-            for child in children
-            if start_index
-            <= child.documentai_object.layout.text_anchor.text_segments[0].start_index
-            < end_index
-            and start_index
-            < child.documentai_object.layout.text_anchor.text_segments[0].end_index
-            <= end_index
+            for child in potential_children
+            if self._text_segment.start_index
+            <= child._text_segment.start_index
+            < self._text_segment.end_index
+            and self._text_segment.start_index
+            < child._text_segment.end_index
+            <= self._text_segment.end_index
         ]
 
 
@@ -219,7 +213,7 @@ class Symbol(_BasePageElement):
     """
 
     @cached_property
-    def hocr_bounding_box(self):
+    def hocr_bounding_box(self) -> Optional[str]:
         # Symbols are not represented in hOCR
         return None
 
@@ -238,11 +232,8 @@ class Token(_BasePageElement):
     """
 
     @cached_property
-    def symbols(self):
-        return cast(
-            List[Symbol],
-            self._get_children_of_element(self._page.symbols),
-        )
+    def symbols(self) -> List[Symbol]:
+        return self._get_children_of_element(self._page.symbols)
 
 
 @dataclasses.dataclass
@@ -259,11 +250,8 @@ class Line(_BasePageElement):
     """
 
     @cached_property
-    def tokens(self):
-        return cast(
-            List[Token],
-            self._get_children_of_element(self._page.tokens),
-        )
+    def tokens(self) -> List[Token]:
+        return self._get_children_of_element(self._page.tokens)
 
 
 @dataclasses.dataclass
@@ -280,11 +268,8 @@ class Paragraph(_BasePageElement):
     """
 
     @cached_property
-    def lines(self):
-        return cast(
-            List[Line],
-            self._get_children_of_element(self._page.lines),
-        )
+    def lines(self) -> List[Line]:
+        return self._get_children_of_element(self._page.lines)
 
 
 @dataclasses.dataclass
@@ -296,16 +281,13 @@ class Block(_BasePageElement):
             Required. The original object.
         text (str):
             Required. The text of the Block.
-        _paragraphs (List[Paragraph]):
+        paragraphs (List[Paragraph]):
             Optional. The Paragraphs contained within the Block.
     """
 
     @cached_property
-    def paragraphs(self):
-        return cast(
-            List[Paragraph],
-            self._get_children_of_element(self._page.paragraphs),
-        )
+    def paragraphs(self) -> List[Paragraph]:
+        return self._get_children_of_element(self._page.paragraphs)
 
 
 @dataclasses.dataclass
@@ -330,7 +312,7 @@ def _get_hocr_bounding_box(
     element_with_layout: ElementWithLayout,
     page_dimension: documentai.Document.Page.Dimension,
 ) -> Optional[str]:
-    r"""Returns a hOCR bounding box string.
+    """Returns a hOCR bounding box string.
 
     Args:
         element_with_layout (ElementWithLayout):
@@ -340,7 +322,7 @@ def _get_hocr_bounding_box(
 
     Returns:
         Optional[str]:
-            hOCR bounding box sring.
+            hOCR bounding box string.
     """
     if not element_with_layout.layout.bounding_poly:
         return None
@@ -483,6 +465,5 @@ def blocks(self):
     @cached_property
     def hocr_bounding_box(self):
         return _get_hocr_bounding_box(
-            element_with_layout=self.documentai_object,
-            page_dimension=self.documentai_object.dimension,
+            self.documentai_object, self.documentai_object.dimension
         )

From 3879dfaf96c364835acf36d3403cda59ab966461 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Thu, 13 Jun 2024 12:46:21 -0500
Subject: [PATCH 04/13] Remove erroneous `.layout` access in FormField

---
 google/cloud/documentai_toolbox/wrappers/page.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 26f227c2..92e5c519 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -115,7 +115,7 @@ class FormField:
     def field_name(self) -> str:
         return self._trim_text(
             _text_from_layout(
-                self.documentai_object.field_name.layout, self._page._document_text
+                self.documentai_object.field_name, self._page._document_text
             )
         )
 
@@ -123,7 +123,7 @@ def field_name(self) -> str:
     def field_value(self) -> str:
         return self._trim_text(
             _text_from_layout(
-                self.documentai_object.field_value.layout, self._page._document_text
+                self.documentai_object.field_value, self._page._document_text
             )
         )
 

From ea76362a66ce861da6b6f8ee3c119773af6f0614 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Thu, 13 Jun 2024 12:48:31 -0500
Subject: [PATCH 05/13] Improve docstring for text_segment

---
 google/cloud/documentai_toolbox/wrappers/page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 92e5c519..4664a049 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -171,7 +171,7 @@ def hocr_bounding_box(self) -> Optional[str]:
     @cached_property
     def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment:
         """
-        Element text section
+        Page element text segment.
         """
         return self.documentai_object.layout.text_anchor.text_segments[0]
 

From 03f7621f513ca780a3f41680c3f7c155516f36c0 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Thu, 13 Jun 2024 12:50:48 -0500
Subject: [PATCH 06/13] Move `_trim_text` back

---
 .../cloud/documentai_toolbox/wrappers/page.py | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 4664a049..69203809 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -113,7 +113,7 @@ class FormField:
 
     @cached_property
     def field_name(self) -> str:
-        return self._trim_text(
+        return _trim_text(
             _text_from_layout(
                 self.documentai_object.field_name, self._page._document_text
             )
@@ -121,26 +121,26 @@ def field_name(self) -> str:
 
     @cached_property
     def field_value(self) -> str:
-        return self._trim_text(
+        return _trim_text(
             _text_from_layout(
                 self.documentai_object.field_value, self._page._document_text
             )
         )
 
-    @staticmethod
-    def _trim_text(text: str) -> str:
-        """Remove extra space characters from text (blank, newline, tab, etc.)
 
-        Args:
-            text (str):
-                Required. UTF-8 encoded text in reading order
-                from the document.
+def _trim_text(text: str) -> str:
+    """Remove extra space characters from text (blank, newline, tab, etc.)
 
-        Returns:
-            str:
-                Text without trailing spaces/newlines
-        """
-        return text.strip().replace("\n", " ")
+    Args:
+        text (str):
+            Required. UTF-8 encoded text in reading order
+            from the document.
+
+    Returns:
+        str:
+            Text without trailing spaces/newlines
+    """
+    return text.strip().replace("\n", " ")
 
 
 @dataclasses.dataclass

From 322612e241ee74bbd6d266aa42bef6fd7dbb7404 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Thu, 13 Jun 2024 13:20:40 -0500
Subject: [PATCH 07/13] Add local variables and stop early to improve
 efficiency further

---
 .../cloud/documentai_toolbox/wrappers/page.py | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 69203809..64f22316 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -188,16 +188,22 @@ def _get_children_of_element(
             List[_BasePageElement]:
                 A list of wrapped children that are inside an element.
         """
-        return [
-            child
-            for child in potential_children
-            if self._text_segment.start_index
-            <= child._text_segment.start_index
-            < self._text_segment.end_index
-            and self._text_segment.start_index
-            < child._text_segment.end_index
-            <= self._text_segment.end_index
-        ]
+        start_index = self._text_segment.start_index
+        end_index = self._text_segment.end_index
+
+        children = []
+        for child in potential_children:
+            child_start_index = child._text_segment.start_index
+            child_end_index = child._text_segment.end_index
+
+            if child_start_index >= end_index:
+                break
+            if (
+                start_index <= child_start_index < end_index
+                and start_index < child_end_index <= end_index
+            ):
+                children.append(child)
+        return children
 
 
 @dataclasses.dataclass
@@ -344,7 +350,7 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
 
     Args:
         layout (documentai.Document.Page.Layout):
-            Required. an element with layout fields.
+            Required. An element with layout fields.
         text (str):
             Required. UTF-8 encoded text in reading order
             of the `documentai.Document` containing the layout element.
@@ -353,6 +359,8 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
         str:
             Text from a single element.
     """
+    if not layout.text_anchor or not layout.text_anchor.text_segments:
+        return ""
 
     # Note: `layout.text_anchor.text_segments` are indexes into the full Document text.
     # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#textsegment

From 2e39a0a3f05bf52c1022e8ebf7985cdd2eec1b2f Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Mon, 1 Jul 2024 12:03:57 -0500
Subject: [PATCH 08/13] Updated Docstring for `_get_children_of_element`

---
 .../cloud/documentai_toolbox/wrappers/page.py | 30 ++++++++++++++++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 64f22316..eef355d4 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -178,15 +178,37 @@ def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment:
     def _get_children_of_element(
         self, potential_children: List["_BasePageElement"]
     ) -> List["_BasePageElement"]:
-        """Returns a list of children inside element.
+        """
+        Filters potential child elements to identify only those fully contained within this element.
+
+        This method iterates through a list of potential child elements, checking if their
+        start and end indices fall completely within the start and end indices of this element.
+        Elements that are only partially contained or entirely outside this element's range are excluded.
 
         Args:
             potential_children (List[_BasePageElement]):
-                Required. List of wrapped children.
+                Required. A list of wrapped page elements (e.g., words, lines, paragraphs)
+                that could potentially be children of this element.
 
         Returns:
             List[_BasePageElement]:
-                A list of wrapped children that are inside an element.
+                A new list containing only the wrapped page elements that are fully
+                contained within this element, maintaining their original order.
+
+        Raises:
+            TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`.
+
+        Example:
+            ```
+            page_element = PageElement(text_segment=TextSegment(0, 100))
+            potential_children = [
+                PageElement(text_segment=TextSegment(10, 20)),  # Inside
+                PageElement(text_segment=TextSegment(5, 105)),  # Overlapping
+                PageElement(text_segment=TextSegment(120, 150))  # Outside
+            ]
+            children = page_element._get_children_of_element(potential_children)
+            # children will contain only the first PageElement
+            ```
         """
         start_index = self._text_segment.start_index
         end_index = self._text_segment.end_index
@@ -197,7 +219,7 @@ def _get_children_of_element(
             child_end_index = child._text_segment.end_index
 
             if child_start_index >= end_index:
-                break
+                break  # Optimization: stop early if child is beyond the end of this element
             if (
                 start_index <= child_start_index < end_index
                 and start_index < child_end_index <= end_index

From e0dd0bee2a6dd79383008258e2e04de045ebb5de Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Mon, 1 Jul 2024 12:08:07 -0500
Subject: [PATCH 09/13] Added code comment for reason of cached_property

---
 google/cloud/documentai_toolbox/wrappers/page.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index eef355d4..5e985a53 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -168,6 +168,8 @@ def hocr_bounding_box(self) -> Optional[str]:
             self.documentai_object, self._page.documentai_object.dimension
         )
 
+    # This field is a cached property to improve export times for hOCR
+    # as outlined in https://github.com/googleapis/python-documentai-toolbox/issues/312
     @cached_property
     def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment:
         """

From ba33c99ad3b74744ac108251781234a1ad91ae62 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Mon, 1 Jul 2024 12:11:24 -0500
Subject: [PATCH 10/13] Attempt to fix docstring formatting

---
 google/cloud/documentai_toolbox/wrappers/page.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 5e985a53..add192f9 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -201,6 +201,7 @@ def _get_children_of_element(
             TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`.
 
         Example:
+
             ```
             page_element = PageElement(text_segment=TextSegment(0, 100))
             potential_children = [
@@ -211,6 +212,7 @@ def _get_children_of_element(
             children = page_element._get_children_of_element(potential_children)
             # children will contain only the first PageElement
             ```
+
         """
         start_index = self._text_segment.start_index
         end_index = self._text_segment.end_index

From 46aefcc46e17c852e539fb42e33cdc7d2afcabb1 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Mon, 1 Jul 2024 12:23:44 -0500
Subject: [PATCH 11/13] Dedent example code in `_get_children_of_element`
 docstring

---
 .../cloud/documentai_toolbox/wrappers/page.py | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index add192f9..144c8d31 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -201,18 +201,16 @@ def _get_children_of_element(
             TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`.
 
         Example:
-
-            ```
-            page_element = PageElement(text_segment=TextSegment(0, 100))
-            potential_children = [
-                PageElement(text_segment=TextSegment(10, 20)),  # Inside
-                PageElement(text_segment=TextSegment(5, 105)),  # Overlapping
-                PageElement(text_segment=TextSegment(120, 150))  # Outside
-            ]
-            children = page_element._get_children_of_element(potential_children)
-            # children will contain only the first PageElement
-            ```
-
+        ```
+        page_element = PageElement(text_segment=TextSegment(0, 100))
+        potential_children = [
+            PageElement(text_segment=TextSegment(10, 20)),  # Inside
+            PageElement(text_segment=TextSegment(5, 105)),  # Overlapping
+            PageElement(text_segment=TextSegment(120, 150))  # Outside
+        ]
+        children = page_element._get_children_of_element(potential_children)
+        # children will contain only the first PageElement
+        ```
         """
         start_index = self._text_segment.start_index
         end_index = self._text_segment.end_index

From aeebb80d1ee3952d96f54fc48ac755e157072b07 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Mon, 1 Jul 2024 12:26:59 -0500
Subject: [PATCH 12/13] Remove Raises and Example sections from docstring

---
 google/cloud/documentai_toolbox/wrappers/page.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 144c8d31..046e1411 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -196,21 +196,6 @@ def _get_children_of_element(
             List[_BasePageElement]:
                 A new list containing only the wrapped page elements that are fully
                 contained within this element, maintaining their original order.
-
-        Raises:
-            TypeError: If `potential_children` is not a list or contains elements that are not of type `_BasePageElement`.
-
-        Example:
-        ```
-        page_element = PageElement(text_segment=TextSegment(0, 100))
-        potential_children = [
-            PageElement(text_segment=TextSegment(10, 20)),  # Inside
-            PageElement(text_segment=TextSegment(5, 105)),  # Overlapping
-            PageElement(text_segment=TextSegment(120, 150))  # Outside
-        ]
-        children = page_element._get_children_of_element(potential_children)
-        # children will contain only the first PageElement
-        ```
         """
         start_index = self._text_segment.start_index
         end_index = self._text_segment.end_index

From 0faba3c26e7cf6f389ee1892b6e8de24ed0792e9 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Tue, 2 Jul 2024 15:24:09 -0500
Subject: [PATCH 13/13] Added code comments for Review comments

---
 google/cloud/documentai_toolbox/wrappers/page.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 046e1411..35a2491e 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -84,6 +84,7 @@ def _extract_table_rows(
         """
         return [
             [
+                # Newlines removed to improve formatting for export formats.
                 _text_from_layout(cell.layout, self._page._document_text).replace(
                     "\n", ""
                 )
@@ -140,6 +141,8 @@ def _trim_text(text: str) -> str:
         str:
             Text without trailing spaces/newlines
     """
+    # Newlines are replaced with spaces to correct common
+    # malformed output from Form Parser.
     return text.strip().replace("\n", " ")