# Source code for contextgem.public.converters.docx

#
# ContextGem
#
# Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
DOCX document conversion module for ContextGem.

Provides functionality for converting Microsoft Word DOCX files into ContextGem document objects,
preserving text, structure, tables, footnotes, headers, footers, and embedded images.
Implemented through the DocxConverter class.
"""

from __future__ import annotations

import warnings
from pathlib import Path
from typing import BinaryIO, cast

from contextgem.internal.converters.docx.base import _DocxConverterBase
from contextgem.internal.converters.docx.package import _DocxPackage
from contextgem.internal.exceptions import DocxConverterError
from contextgem.internal.loggers import logger
from contextgem.internal.typings.types import TextMode
from contextgem.public.documents import Document
from contextgem.public.paragraphs import Paragraph



class DocxConverter(_DocxConverterBase):
    """
    Converter for DOCX files into ContextGem documents.

    This class handles extraction of text, formatting, tables, images, footnotes,
    comments, and other elements from DOCX files by directly parsing Word XML.

    The converter is read-only and does not modify the source DOCX file
    in any way. It only extracts content for conversion to ContextGem document object
    or text formats.

    The resulting ContextGem document is populated with the following:

    - Raw text: The raw text of the DOCX file.

    - Paragraphs: Paragraph objects with the following metadata:

      - Raw text: The raw text of the paragraph.
      - Additional context: Metadata about the paragraph's style, list level,
        table cell position, being part of a footnote or comment, etc. This context
        provides additional information that is useful for LLM analysis and extraction.

    - Images: Image objects constructed from embedded images in the DOCX file.

    Example:
        .. literalinclude:: ../../../dev/usage_examples/readme/docx_converter.py
            :language: python
            :caption: DocxConverter usage example
    """


    def convert_to_text_format(
        self,
        docx_path_or_file: str | Path | BinaryIO,
        output_format: TextMode = "markdown",
        include_tables: bool = True,
        include_comments: bool = True,
        include_footnotes: bool = True,
        include_headers: bool = True,
        include_footers: bool = True,
        include_textboxes: bool = True,
        include_links: bool = True,
        include_inline_formatting: bool = True,
        strict_mode: bool = False,
    ) -> str:
        """
        Renders a DOCX file as a single text string, without building a ContextGem Document.

        The requested format is matched case-insensitively. In "markdown" mode the
        processed lines are joined with single newlines; in "raw" mode the raw text
        of each extracted paragraph is joined with blank lines.

        :param docx_path_or_file: Path to the DOCX file (as string or Path object) or a file-like object
        :param output_format: Output format ("markdown" or "raw") (default: "markdown")
        :param include_tables: If True, include tables in the output (default: True)
        :param include_comments: If True, include comments in the output (default: True)
        :param include_footnotes: If True, include footnotes in the output (default: True)
        :param include_headers: If True, include headers in the output (default: True)
        :param include_footers: If True, include footers in the output (default: True)
        :param include_textboxes: If True, include textbox content (default: True)
        :param include_links: If True, process and format hyperlinks (default: True)
        :param include_inline_formatting: If True, apply inline formatting (bold, italic, etc.)
            in markdown mode (default: True)
        :param strict_mode: If True, raise exceptions for any processing error
            instead of skipping problematic elements (default: False)
        :return: Text in the specified format
        :raises DocxConverterError: If the output format is neither "markdown" nor "raw",
            or if any other error occurs during conversion (the original exception
            is chained as the cause)

        .. note::
           When using markdown output format, the following conditions apply:

           * Document structure elements (headings, lists, tables) are preserved
           * Headings are converted to markdown heading syntax (# Heading 1, ## Heading 2, etc.)
           * Lists are converted to markdown list syntax, preserving numbering and hierarchy
           * Tables are formatted using markdown table syntax
           * Footnotes, comments, headers, and footers are included as specially marked sections
        """
        # The extension check happens before the file is opened
        self._validate_file_extension(docx_path_or_file)

        # Options forwarded unchanged to the element processor, whichever
        # output format is requested
        element_options = {
            "include_tables": include_tables,
            "include_comments": include_comments,
            "include_footnotes": include_footnotes,
            "include_headers": include_headers,
            "include_footers": include_footers,
            "include_textboxes": include_textboxes,
            "include_links": include_links,
            "strict_mode": strict_mode,
            "include_inline_formatting": include_inline_formatting,
        }

        package = None
        try:
            package = _DocxPackage(docx_path_or_file)
            requested_format = output_format.lower()

            # Guard clause: anything other than the two known formats is rejected
            if requested_format not in ("markdown", "raw"):
                raise DocxConverterError(f"Invalid output format: {output_format}")

            if requested_format == "markdown":
                md_lines = self._process_docx_elements(
                    package, markdown_mode=True, **element_options
                )
                # Safe cast: markdown mode yields a list of strings
                return "\n".join(cast(list[str], md_lines))

            extracted = self._process_docx_elements(
                package, markdown_mode=False, **element_options
            )
            # Safe cast: non-markdown mode yields a list of Paragraph objects
            return "\n\n".join(
                paragraph.raw_text for paragraph in cast(list[Paragraph], extracted)
            )
        except DocxConverterError:
            # Converter errors already carry the right type; pass them through
            raise
        except Exception as exc:
            # Anything else is logged and wrapped so callers only need to
            # handle DocxConverterError
            message = f"Error converting DOCX to {output_format}: {exc}"
            logger.error(message)
            raise DocxConverterError(message) from exc
        finally:
            # Release the package on every exit path, including errors
            if package:
                package.close()



    def convert(
        self,
        docx_path_or_file: str | Path | BinaryIO,
        apply_markdown: bool = True,
        raw_text_to_md: bool | None = None,  # TODO: remove this parameter in v1.0.0.
        include_tables: bool = True,
        include_comments: bool = True,
        include_footnotes: bool = True,
        include_headers: bool = True,
        include_footers: bool = True,
        include_textboxes: bool = True,
        include_images: bool = True,
        include_links: bool = True,
        include_inline_formatting: bool = True,
        strict_mode: bool = False,
    ) -> Document:
        """
        Builds a ContextGem Document object from a DOCX file.

        Paragraph objects are extracted first and their raw texts, joined with
        blank lines, become the document's raw text. When markdown is requested,
        a second pass over the same package produces the markdown lines that are
        stored on the document separately from the raw text. Embedded images are
        attached to the document when requested.

        :param docx_path_or_file: Path to the DOCX file (as string or Path object) or a file-like object
        :param apply_markdown: If True, applies markdown processing and formatting to the document content
            while preserving raw text separately (default: True)
        :param raw_text_to_md: [DEPRECATED] Use apply_markdown instead. Will be removed in v1.0.0.
            When provided, a DeprecationWarning is emitted and the value overrides apply_markdown.
            Note: This parameter previously controlled whether raw_text would contain raw or markdown text.
            The new apply_markdown parameter instead controls whether to apply markdown processing
            while keeping raw text and processed text separate.
        :param include_tables: If True, include tables in the output (default: True)
        :param include_comments: If True, include comments in the output (default: True)
        :param include_footnotes: If True, include footnotes in the output (default: True)
        :param include_headers: If True, include headers in the output (default: True)
        :param include_footers: If True, include footers in the output (default: True)
        :param include_textboxes: If True, include textbox content (default: True)
        :param include_images: If True, extract and include images (default: True)
        :param include_links: If True, process and format hyperlinks (default: True)
        :param include_inline_formatting: If True, apply inline formatting (bold, italic, etc.)
            in markdown mode (default: True)
        :param strict_mode: If True, raise exceptions for any processing error
            instead of skipping problematic elements (default: False)
        :return: A populated Document object
        :raises DocxConverterError: If any error occurs during conversion (non-converter
            exceptions are wrapped, with the original exception chained as the cause)
        """
        # The extension check happens before the file is opened
        self._validate_file_extension(docx_path_or_file)

        # Backward compatibility: the deprecated flag, when given, wins over
        # apply_markdown. The warning is issued here (not in a helper) so that
        # stacklevel=2 keeps pointing at the caller of convert().
        if raw_text_to_md is not None:
            warnings.warn(
                "The 'raw_text_to_md' parameter is deprecated and will be removed in v1.0.0. "
                "Please use 'apply_markdown' instead. Note: This change affects how text processing "
                "is handled - the Document now maintains separate raw and processed text representations.",
                DeprecationWarning,
                stacklevel=2,
            )
            apply_markdown = raw_text_to_md

        # Options shared by the paragraph pass and the optional markdown pass
        element_options = {
            "include_tables": include_tables,
            "include_comments": include_comments,
            "include_footnotes": include_footnotes,
            "include_headers": include_headers,
            "include_footers": include_footers,
            "include_textboxes": include_textboxes,
            "include_links": include_links,
            "include_inline_formatting": include_inline_formatting,
            "strict_mode": strict_mode,
        }

        package = None
        try:
            # Paths are logged as-is; file-like objects get a generic label
            if isinstance(docx_path_or_file, (str, Path)):
                file_desc = docx_path_or_file
            else:
                file_desc = "file object"
            logger.info(f"Converting DOCX: {file_desc} (strict mode: {strict_mode})")

            package = _DocxPackage(docx_path_or_file)

            # First pass: always collect Paragraph objects; markdown handling of
            # their text is controlled by the two apply_markdown-driven flags
            logger.debug("Processing document elements")
            paragraphs = self._process_docx_elements(
                package,
                markdown_mode=False,
                use_markdown_text_in_paragraphs=apply_markdown,
                populate_md_text=apply_markdown,
                **element_options,
            )
            logger.debug(f"Extracted {len(paragraphs)} paragraphs")

            # Raw text is derived from the paragraphs already extracted
            # Safe cast: paragraphs is returned as a list of Paragraph objects
            raw_text = "\n\n".join(
                paragraph.raw_text for paragraph in cast(list[Paragraph], paragraphs)
            )
            context_doc = Document(raw_text=raw_text, paragraphs=paragraphs)

            if apply_markdown:
                # Second pass over the same package, this time in markdown mode
                markdown_lines = self._process_docx_elements(
                    package, markdown_mode=True, **element_options
                )
                # Safe cast: markdown_lines is returned as a list of strings
                context_doc._md_text = "\n".join(cast(list[str], markdown_lines))

            if include_images:
                logger.debug("Processing images")
                images = self._extract_images(package, strict_mode=strict_mode)
                context_doc.images = images
                logger.debug(f"Added {len(images)} images to document")

            # Report zero images when extraction was not requested
            image_count = len(context_doc.images) if include_images else 0
            logger.info(
                f"DOCX conversion completed successfully: {len(paragraphs)} paragraphs, "
                f"{image_count} images"
            )
            return context_doc
        except DocxConverterError:
            # Converter errors already carry the right type; pass them through
            raise
        except Exception as exc:
            # Anything else is logged and wrapped so callers only need to
            # handle DocxConverterError
            message = f"Error converting DOCX file: {exc}"
            logger.error(message)
            raise DocxConverterError(message) from exc
        finally:
            # Release the package on every exit path, including errors
            if package:
                package.close()