#
# ContextGem
#
# Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Module for handling documents.
This module provides the Document class, which represents a structured or unstructured file
containing written or visual content. Documents can be processed to extract information,
analyze content, and organize data into paragraphs, sentences, aspects, and concepts.
The Document class supports various operations including:
- Managing raw text and structured paragraphs
- Handling embedded or attached images
- Organizing content into aspects for focused analysis
- Associating concepts for information extraction
- Processing documents through pipelines for automated analysis
Documents serve as the primary container for content analysis within the ContextGem framework,
enabling complex document understanding and information extraction workflows.
"""
from __future__ import annotations
import itertools
import warnings
from copy import deepcopy
from typing import Any, Literal, Optional
from pydantic import Field, field_validator, model_validator
from contextgem.internal.base.attrs import _AssignedInstancesProcessor
from contextgem.internal.base.concepts import _Concept
from contextgem.internal.decorators import _post_init_method, _timer_decorator
from contextgem.internal.loggers import logger
from contextgem.internal.typings.aliases import NonEmptyStr, SaTModelId, Self
from contextgem.internal.utils import _get_sat_model, _split_text_into_paragraphs
from contextgem.public.aspects import Aspect
from contextgem.public.images import Image
from contextgem.public.paragraphs import Paragraph
from contextgem.public.pipelines import DocumentPipeline
from contextgem.public.sentences import Sentence
class Document(_AssignedInstancesProcessor):
    """
    Represents a document containing textual and visual content for analysis.

    A document serves as the primary container for content analysis within the ContextGem
    framework, enabling complex document understanding and information extraction workflows.

    :ivar raw_text: The main text of the document as a single string.
        Defaults to None.
    :type raw_text: Optional[NonEmptyStr]
    :ivar paragraphs: List of Paragraph instances in consecutive order as they appear
        in the document. Defaults to an empty list.
    :type paragraphs: list[Paragraph]
    :ivar images: List of Image instances attached to or representing the document.
        Defaults to an empty list.
    :type images: list[Image]
    :ivar aspects: List of aspects associated with the document for focused analysis.
        Validated to ensure unique names and descriptions. Defaults to an empty list.
    :type aspects: list[Aspect]
    :ivar concepts: List of concepts associated with the document for information extraction.
        Validated to ensure unique names and descriptions. Defaults to an empty list.
    :type concepts: list[_Concept]
    :ivar paragraph_segmentation_mode: Mode for paragraph segmentation. When set to "sat",
        uses a SaT (Segment Any Text https://arxiv.org/abs/2406.16678) model.
        Defaults to "newlines".
    :type paragraph_segmentation_mode: Literal["newlines", "sat"]
    :ivar sat_model_id: SaT model ID for paragraph/sentence segmentation.
        Defaults to "sat-3l-sm". See https://github.com/segment-any-text/wtpsplit
        for the list of available models.
    :type sat_model_id: SaTModelId

    Note:
        Normally, you do not need to construct/populate paragraphs manually, as they are
        populated automatically from document's ``raw_text`` attribute. Only use this
        constructor for advanced use cases, such as when you have a custom paragraph
        segmentation tool.

    Example:
        .. literalinclude:: ../../../dev/usage_examples/docstrings/documents/def_document.py
            :language: python
            :caption: Document definition
    """

    raw_text: Optional[NonEmptyStr] = Field(default=None)
    paragraphs: list[Paragraph] = Field(default_factory=list)
    images: list[Image] = Field(default_factory=list)
    aspects: list[Aspect] = Field(default_factory=list)
    concepts: list[_Concept] = Field(default_factory=list)
    paragraph_segmentation_mode: Literal["newlines", "sat"] = Field(default="newlines")
    sat_model_id: SaTModelId = Field(default="sat-3l-sm")

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Sets the attribute of an instance, with additional restrictions on specific attributes.

        :param name: The name of the attribute to set.
        :type name: str
        :param value: The value to assign to the attribute.
        :return: None
        :raises ValueError: If attempting to reassign a restricted attribute
            after it has already been assigned to a *truthy* value.
        """
        if name in ["raw_text", "paragraphs"]:
            # Prevent raw_text/paragraphs reassignment once populated, to prevent
            # inconsistencies in analysis.
            if getattr(self, name, None):
                raise ValueError(
                    f"The attribute `{name}` cannot be changed once populated."
                )
        super().__setattr__(name, value)

    @property
    def sentences(self) -> list[Sentence]:
        """
        Provides access to all sentences within the paragraphs of the document by flattening
        and combining sentences from each paragraph into a single list.

        :return: A list of Sentence objects that are contained within all paragraphs.
        :rtype: list[Sentence]
        """
        return list(itertools.chain.from_iterable(i.sentences for i in self.paragraphs))

    @_timer_decorator(
        "Document initialization",
    )
    @_post_init_method
    def _post_init(self, __context):
        # Derive raw_text from paragraphs when only paragraphs were supplied,
        # then segment paragraphs/sentences as needed.
        self._set_text_from_paras()
        self._segment_paras_and_sents()

    def assign_pipeline(
        self,
        pipeline: DocumentPipeline,
        overwrite_existing: bool = False,
    ) -> Self:
        """
        Assigns a given pipeline to the document. The method deep-copies the input pipeline
        to prevent any modifications to the state of aspects or concepts in the original
        pipeline.

        If the aspects or concepts are already associated with the document, an error is
        raised unless the `overwrite_existing` parameter is explicitly set to `True`.

        :param pipeline: The DocumentPipeline object to attach to the document.
        :param overwrite_existing: A boolean flag. If set to True, any existing aspects and
            concepts assigned to the document will be overwritten by the new pipeline.
            Defaults to False.
        :return: Returns the current instance of the document after assigning the pipeline.
        :raises RuntimeError: If the document already has aspects or concepts assigned and
            ``overwrite_existing`` is False.
        """
        if (self.aspects or self.concepts) and not overwrite_existing:
            raise RuntimeError(
                "Document already has aspects and concepts assigned. "
                "Use `overwrite_existing=True` to overwrite existing aspects and concepts "
                "with the pipeline."
            )
        document_pipeline = deepcopy(
            pipeline
        )  # deep copy to avoid aspect/concept state modification of the pipeline
        self.aspects = document_pipeline.aspects
        self.concepts = document_pipeline.concepts
        logger.info("Pipeline assigned to the document")
        return self

    def _segment_paras_and_sents(self) -> None:
        """
        If no paragraphs are provided, but text exists, extracts paragraphs from text and
        assigns them on the document. The ``paragraph_segmentation_mode`` value determines
        whether the paragraphs will be segmented by newlines or using a SaT model.

        If paragraphs exist and some of them do not have extracted sentences, extracts
        sentences for such paragraphs and assigns them on the paragraphs. Sentences are
        always segmented using the SaT model.

        Does nothing if only images are provided without text or paragraphs.

        :raises ValueError: If the segmentation mode is invalid, no valid paragraphs are
            found in the text, or segmented paragraphs/sentences cannot be matched back
            to the source text.
        """
        if self.raw_text and not self.paragraphs:
            # Extract paragraphs from text, if text provided without paragraphs
            logger.info(
                "Text is being split into paragraphs, as no custom paragraphs were provided..."
            )
            if self.paragraph_segmentation_mode == "newlines":
                paragraphs: list[str] = _split_text_into_paragraphs(self.raw_text)
            elif self.paragraph_segmentation_mode == "sat":
                paragraphs: list[list[str]] = _get_sat_model(self.sat_model_id).split(
                    self.raw_text,
                    do_paragraph_segmentation=True,
                )
                # SaT returns each paragraph as a list of sentences; re-join them
                # to obtain the paragraph text.
                paragraphs = ["".join(i) for i in paragraphs]
            else:
                raise ValueError(
                    f"Invalid paragraph segmentation mode: {self.paragraph_segmentation_mode}"
                )
            if not paragraphs:
                raise ValueError("No valid paragraphs in text")

            # Assign paragraphs on the document
            paragraphs: list[Paragraph] = [Paragraph(raw_text=i) for i in paragraphs]

            # Check that each paragraph is found in the document text
            # For duplicate paragraphs, verify each occurrence is matched in the document
            remaining_text = self.raw_text
            for paragraph in paragraphs:
                if paragraph.raw_text not in remaining_text:
                    raise ValueError(
                        "Not all segmented paragraphs were matched in document text."
                    )
                # Remove the first occurrence to handle duplicates correctly
                remaining_text = remaining_text.replace(paragraph.raw_text, "", 1)
            self.paragraphs = paragraphs

        if self.paragraphs:
            # Extract sentences for each paragraph without sentences provided
            if not all(i.sentences for i in self.paragraphs):
                logger.info("Paragraphs are being split into sentences...")
                if any(i.sentences for i in self.paragraphs):
                    # stacklevel=2 attributes the warning to the caller, not to
                    # this library-internal frame.
                    warnings.warn(
                        "Some paragraphs already have sentences. "
                        "These will be used `as is`.",
                        stacklevel=2,
                    )
                split_sents_for_paras = _get_sat_model(self.sat_model_id).split(
                    [p.raw_text for p in self.paragraphs]
                )
                for paragraph, sent_group in zip(
                    self.paragraphs, split_sents_for_paras
                ):
                    if not paragraph.sentences:
                        # Filter out empty sents, if any
                        sent_group = [i.strip() for i in sent_group]
                        sent_group = [i for i in sent_group if len(i)]
                        # Explicit raise instead of `assert`: assertions are stripped
                        # under `python -O`, which would silently skip this integrity
                        # check. Also consistent with the paragraph-level check above.
                        if not all(i in paragraph.raw_text for i in sent_group):
                            raise ValueError(
                                "Not all segmented sentences were matched in paragraph text."
                            )
                        paragraph.sentences = [
                            Sentence(
                                raw_text=i, custom_data=paragraph.custom_data
                            )  # inherit custom data from paragraph object
                            for i in sent_group
                        ]

    def _set_text_from_paras(self) -> None:
        """
        Sets the text attribute for the object by combining text from paragraphs.

        This method checks if the `paragraphs` attribute of the object exists and
        is not empty, while the `text` attribute is empty. If these conditions
        are met, it merges the text content of all the paragraphs and assigns it
        to the `text` attribute.

        :return: None
        """
        if self.paragraphs and not self.raw_text:
            logger.info("Text is being set from paragraphs...")
            self.raw_text = "\n\n".join([i.raw_text for i in self.paragraphs])

    @field_validator("images")
    @classmethod
    def _validate_images(cls, images: list[Image]) -> list[Image]:
        """
        Validates the uniqueness of document images provided in the document.

        :param images: A list of `Image` objects to validate.
        :return: The original list of `Image` objects if all images are unique.
        :raises ValueError: If a duplicate image is found in the list, based on its
            `base64_data` content.
        """
        seen = set()
        for image in images:
            if image.base64_data in seen:
                # Plain string literal: the former `f` prefix had no placeholders.
                raise ValueError(
                    "Image already exists in the document. All images must be unique."
                )
            seen.add(image.base64_data)
        return images

    @model_validator(mode="before")
    @classmethod
    def _validate_document_pre(cls, data: Any) -> Any:
        """
        Validates the document's raw input data, which could be a dict with input values,
        an instance of the model, or another type depending on what is passed to the model.

        :raises ValueError:
            - If none of `raw_text`, `paragraphs`, or `images` are provided.
        :return:
            The validated data.
        """
        if isinstance(data, dict):
            if (
                not data.get("raw_text")
                and not data.get("paragraphs")
                and not data.get("images")
            ):
                raise ValueError(
                    "Either raw_text, paragraphs, or images must be provided for the document."
                )
        return data

    @model_validator(mode="after")
    def _validate_document_post(self) -> Self:
        """
        Validates the consistency between the `text` attribute and the `paragraphs`
        attribute of the instance. Specifically, verifies that if both `text` and
        `paragraphs` are provided, each paragraph's `text` must exist in the overall
        document's `text`.

        Does nothing if both `text` and `paragraphs` are not provided.

        :param self: The instance of the model being validated.
        :return: The validated instance of the model.
        :raises ValueError: If the `text` attribute exists, and not all paragraphs are
            matched in the overall document's text, or if paragraphs are out of order.
        """
        if self.raw_text and self.paragraphs:
            # Check that all paragraphs exist in the document text
            if not all(i.raw_text in self.raw_text for i in self.paragraphs):
                raise ValueError("Not all paragraphs were matched in document text.")

            # Check that paragraphs are ordered according to their appearance in the raw text
            # Handle case where paragraphs may have duplicate text content
            current_search_pos = 0
            for i in range(len(self.paragraphs) - 1):
                # Find current paragraph starting from the current search position
                current_pos = self.raw_text.find(
                    self.paragraphs[i].raw_text, current_search_pos
                )
                if current_pos == -1:  # This shouldn't happen due to earlier check
                    current_pos = self.raw_text.find(self.paragraphs[i].raw_text)

                # Update search position for next paragraph to start after current paragraph
                current_search_pos = current_pos + len(self.paragraphs[i].raw_text)

                # Find next paragraph starting from the current search position
                next_pos = self.raw_text.find(
                    self.paragraphs[i + 1].raw_text, current_search_pos
                )
                if (
                    next_pos == -1
                ):  # If not found from current position, check if it exists earlier
                    next_pos = self.raw_text.find(self.paragraphs[i + 1].raw_text)
                    if next_pos < current_search_pos:
                        raise ValueError(
                            "Paragraphs are not ordered according to their appearance in the document text."
                        )
        return self