Source code for contextgem.public.converters.docx

#
# ContextGem
#
# Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
DOCX document conversion module for ContextGem.

Provides functionality for converting Microsoft Word DOCX files into ContextGem document objects,
preserving text, structure, tables, footnotes, headers, footers, and embedded images.
Implemented through the DocxConverter class.
"""

import base64
import os
import re
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import BinaryIO, Optional

from contextgem.internal.loggers import logger
from contextgem.internal.typings.aliases import RawTextMode
from contextgem.public.documents import Document
from contextgem.public.images import Image
from contextgem.public.paragraphs import Paragraph


# Define custom exceptions
class DocxConverterError(Exception):
    """
    Root of the DOCX converter exception hierarchy.

    Every converter-specific exception in this module derives from this class,
    so catching it handles any failure raised by the DOCX converter.
    """


class DocxFormatError(DocxConverterError):
    """
    Signals that the input is not a usable DOCX container.

    Raised when the file cannot be opened as a ZIP archive or lacks the parts
    that make it a DOCX file (i.e. the format is invalid or corrupted).
    """


class DocxXmlError(DocxConverterError):
    """
    Signals that an XML part inside the DOCX file could not be parsed.
    """


class DocxContentError(DocxConverterError):
    """
    Signals that content required for conversion is absent from the DOCX file.
    """


# XML namespace prefix -> URI map used in DOCX files.
# Passed as the namespace argument to ElementTree ``find``/``findall`` calls so that
# XPath expressions can use the short prefixes (e.g. ``.//rels:Relationship``).
NAMESPACES = {
    # WordprocessingML main namespace
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    # DrawingML wordprocessing-drawing namespace
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    # DrawingML main namespace
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    # DrawingML picture namespace
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    # officeDocument relationships namespace
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    # Package relationships namespace (the ``Relationship`` elements in ``.rels`` parts)
    "rels": "http://schemas.openxmlformats.org/package/2006/relationships",
    # VML namespace
    "v": "urn:schemas-microsoft-com:vml",
    # Markup compatibility namespace
    "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
}


class DocxPackage:
    """
    Represents a DOCX file as a package and provides access to its XML parts.
    """

    def __init__(self, docx_path_or_file: str | Path | BinaryIO):
        """
        Initialize with either a path to a DOCX file or a file-like object.

        :param docx_path_or_file: Path to DOCX file (as string or Path object) or file-like object
        """
        self.archive = None
        self.rels = {}
        self.main_document = None
        self.styles = None
        self.numbering = None
        self.footnotes = None
        self.comments = None
        self.headers = {}
        self.footers = {}
        self.images = {}

        file_desc = (
            docx_path_or_file
            if isinstance(docx_path_or_file, (str, Path))
            else "file object"
        )

        try:
            self.archive = zipfile.ZipFile(docx_path_or_file)
        except zipfile.BadZipFile:
            raise DocxFormatError(f"'{file_desc}' is not a valid ZIP file")
        except FileNotFoundError:
            raise DocxFormatError(f"File '{file_desc}' not found")
        except PermissionError:
            raise DocxFormatError(f"Permission denied when accessing '{file_desc}'")
        except Exception as e:
            raise DocxFormatError(
                f"Failed to open DOCX file '{file_desc}': {str(e)}"
            ) from e

        try:
            # Check if this is actually a DOCX file by looking for key parts
            if "word/document.xml" not in self.archive.namelist():
                raise DocxFormatError(
                    f"'{file_desc}' is not a valid DOCX file (missing word/document.xml)"
                )

            # Load main parts
            self._load_relationships()
            self._load_main_document()
            self._load_styles()
            self._load_numbering()
            self._load_footnotes()
            self._load_comments()
            self._load_headers_footers()
            self._load_images()
        except DocxConverterError:
            # Re-raise specific converter errors
            raise
        except ET.ParseError as e:
            raise DocxXmlError(f"XML parsing error in '{file_desc}': {str(e)}") from e
        except KeyError as e:
            raise DocxContentError(
                f"Missing required content in '{file_desc}': {str(e)}"
            ) from e
        except Exception as e:
            raise DocxConverterError(
                f"Error processing DOCX file '{file_desc}': {str(e)}"
            ) from e

    def _load_xml_part(self, part_path: str) -> Optional[ET.Element]:
        """
        Loads an XML part from the DOCX package.

        :param part_path: Path to the XML part within the DOCX package
        :return: ElementTree Element or None if the part doesn't exist
        """
        if part_path not in self.archive.namelist():
            return None

        try:
            data = self.archive.read(part_path)
            return ET.fromstring(data)
        except ET.ParseError as e:
            raise DocxXmlError(f"Failed to parse XML in '{part_path}': {str(e)}") from e
        except Exception as e:
            raise DocxConverterError(f"Error reading '{part_path}': {str(e)}") from e

    def _load_relationships(self):
        """
        Loads the document relationship definitions that connect document parts.
        """
        # Document relationships
        self.rels["document"] = {}
        if "word/_rels/document.xml.rels" in self.archive.namelist():
            doc_rels_root = self._load_xml_part("word/_rels/document.xml.rels")
            if doc_rels_root is not None:
                self.rels["document"] = {
                    rel.attrib["Id"]: {
                        "type": rel.attrib["Type"],
                        "target": rel.attrib["Target"],
                    }
                    for rel in doc_rels_root.findall(".//rels:Relationship", NAMESPACES)
                }

    def _load_main_document(self):
        """
        Loads the main document.xml content.
        """
        self.main_document = self._load_xml_part("word/document.xml")
        if self.main_document is None:
            raise DocxContentError(
                "Main document (word/document.xml) is missing or invalid"
            )

    def _load_styles(self):
        """
        Loads the styles.xml content.
        """
        self.styles = self._load_xml_part("word/styles.xml")

    def _load_numbering(self):
        """
        Loads the numbering.xml content for lists.
        """
        self.numbering = self._load_xml_part("word/numbering.xml")

    def _load_footnotes(self):
        """
        Loads the footnotes.xml content.
        """
        self.footnotes = self._load_xml_part("word/footnotes.xml")

    def _load_comments(self):
        """
        Loads the comments.xml content.
        """
        self.comments = self._load_xml_part("word/comments.xml")

    def _load_headers_footers(self):
        """
        Loads headers and footers referenced in the document.
        """
        if not self.rels.get("document"):
            return

        # Find all header and footer relationships
        for rel_id, rel_info in self.rels["document"].items():
            rel_type = rel_info["type"].lower()
            target = rel_info["target"]

            # Handle relative paths
            if not target.startswith("/"):
                target = f"word/{target}"
            else:
                # Remove leading slash
                target = target[1:]

            # Load headers
            if "header" in rel_type:
                try:
                    header_content = self._load_xml_part(target)
                    if header_content is not None:
                        self.headers[rel_id] = {
                            "target": target,
                            "content": header_content,
                        }
                except DocxXmlError:
                    # Re-raise XML errors
                    raise
                except Exception as e:
                    raise DocxConverterError(
                        f"Error loading header '{target}': {str(e)}"
                    ) from e

            # Load footers
            elif "footer" in rel_type:
                try:
                    footer_content = self._load_xml_part(target)
                    if footer_content is not None:
                        self.footers[rel_id] = {
                            "target": target,
                            "content": footer_content,
                        }
                except DocxXmlError:
                    # Re-raise XML errors
                    raise
                except Exception as e:
                    raise DocxConverterError(
                        f"Error loading footer '{target}': {str(e)}"
                    ) from e

    def _load_images(self):
        """
        Loads all images embedded in the document.
        """
        if not self.rels.get("document"):
            return

        for rel_id, rel_info in self.rels["document"].items():
            if "image" in rel_info["type"].lower():
                target = rel_info["target"]
                # Handle relative paths
                if not target.startswith("/"):
                    target = f"word/{target}"
                else:
                    # Remove leading slash
                    target = target[1:]

                try:
                    if target in self.archive.namelist():
                        image_data = self.archive.read(target)
                        self.images[rel_id] = {
                            "data": image_data,
                            "target": target,
                            # Extract mime type from target extension
                            "mime_type": self._get_mime_type(target),
                        }
                except Exception as e:
                    raise DocxConverterError(
                        f"Error loading image '{target}': {str(e)}"
                    ) from e

    def _get_mime_type(self, target: str) -> str:
        """
        Determines the MIME type from the file extension.

        :param target: Image file path
        :return: MIME type string
        """
        ext = os.path.splitext(target.lower())[1]
        mime_types = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".webp": "image/webp",
        }
        return mime_types.get(ext, "image/png")  # Default to PNG if unknown

    def close(self):
        """Closes the zip archive."""
        if self.archive:
            try:
                self.archive.close()
            except Exception as e:
                # Just log the error but don't raise, as this is cleanup code
                logger.warning(f"Error closing DOCX archive: {str(e)}")


[docs] class DocxConverter: """ Converter for DOCX files into ContextGem documents. This class handles extraction of text, formatting, tables, images, footnotes, comments, and other elements from DOCX files by directly parsing Word XML. The resulting ContextGem document is populated with the following: - Raw text: The raw text of the DOCX file converted to markdown or left as raw text, based on the ``raw_text_to_md`` flag. - Paragraphs: Paragraph objects with the following metadata: - Raw text: The raw text of the paragraph. - Additional context: Metadata about the paragraph's style, list level, table cell position, being part of a footnote or comment, etc. This context provides additional information that is useful for LLM analysis and extraction. - Images: Image objects constructed from embedded images in the DOCX file. Example: .. literalinclude:: ../../../dev/usage_examples/readme/docx_converter.py :language: python :caption: DocxConverter usage example """ def _get_style_name(self, style_id: str, package: DocxPackage) -> str: """ Gets the style name from its ID by looking it up in the styles.xml. 
:param style_id: Style ID to look up :param package: DocxPackage object containing the styles :return: Style name or the style_id if not found """ if not style_id: return "Normal" if package.styles is None: return style_id or "Normal" try: style_element = package.styles.find( f".//w:style[@w:styleId='{style_id}']", NAMESPACES ) if style_element is not None: name_element = style_element.find("w:name", NAMESPACES) if name_element is not None and "val" in name_element.attrib.get( f"{{{NAMESPACES['w']}}}val", "" ): return name_element.attrib[f"{{{NAMESPACES['w']}}}val"] except Exception as e: # If there's an error finding the style, log it but continue with default logger.warning(f"Error looking up style '{style_id}': {str(e)}") return style_id or "Normal" def _get_paragraph_style(self, para_element: ET.Element) -> str: """ Extracts the style information from a paragraph element. :param para_element: Paragraph XML element :return: Style ID string """ # Find the paragraph properties element p_pr = para_element.find("w:pPr", NAMESPACES) if p_pr is not None: # Find the style element within paragraph properties style = p_pr.find("w:pStyle", NAMESPACES) if style is not None and f"{{{NAMESPACES['w']}}}val" in style.attrib: return style.attrib[f"{{{NAMESPACES['w']}}}val"] return "Normal" def _get_list_info( self, para_element: ET.Element, package: DocxPackage ) -> tuple[bool, int, str, str, bool]: """ Extracts list information from a paragraph element. 
:param para_element: Paragraph XML element :param package: DocxPackage object :return: Tuple of (is_list, list_level, list_info_string, list_type, is_numbered) """ is_list = False list_level = 0 list_info = "" list_type = "" is_numbered = False p_pr = para_element.find("w:pPr", NAMESPACES) if p_pr is not None: num_pr = p_pr.find("w:numPr", NAMESPACES) if num_pr is not None: is_list = True # Get list ID num_id_elem = num_pr.find("w:numId", NAMESPACES) num_id = ( num_id_elem.attrib[f"{{{NAMESPACES['w']}}}val"] if num_id_elem is not None else None ) # Get level ilvl_elem = num_pr.find("w:ilvl", NAMESPACES) if ilvl_elem is not None: list_level = int(ilvl_elem.attrib[f"{{{NAMESPACES['w']}}}val"]) # Determine list type and numbering format if numbering is available if num_id and package.numbering is not None: # First find the abstractNumId associated with this numId num_def = package.numbering.find( f".//w:num[@w:numId='{num_id}']", NAMESPACES ) if num_def is not None: abstract_num_id_elem = num_def.find( "w:abstractNumId", NAMESPACES ) if abstract_num_id_elem is not None: abstract_num_id = abstract_num_id_elem.attrib[ f"{{{NAMESPACES['w']}}}val" ] # Now find the level formatting in the abstractNum abstract_num = package.numbering.find( f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']", NAMESPACES, ) if abstract_num is not None: # Find the level formatting for this specific level level_elem = abstract_num.find( f".//w:lvl[@w:ilvl='{list_level}']", NAMESPACES ) if level_elem is not None: # Get the numFmt element which defines if it's bullet or numbered num_fmt = level_elem.find("w:numFmt", NAMESPACES) if ( num_fmt is not None and f"{{{NAMESPACES['w']}}}val" in num_fmt.attrib ): fmt_val = num_fmt.attrib[ f"{{{NAMESPACES['w']}}}val" ] list_type = fmt_val # Check if it's a numbered list format numbered_formats = { "decimal", "decimalZero", "upperRoman", "lowerRoman", "upperLetter", "lowerLetter", "ordinal", "cardinalText", "ordinalText", "hex", "chicago", 
"ideographDigital", "japaneseCounting", "aiueo", "iroha", "arabicFullWidth", "hindiNumbers", "thaiNumbers", } is_numbered = fmt_val in numbered_formats if num_id: list_info = f", List ID: {num_id}, Level: {list_level}" if list_type: list_info += f", Format: {list_type}" return is_list, list_level, list_info, list_type, is_numbered def _extract_footnote_references(self, para_element: ET.Element) -> list[str]: """ Extracts footnote references from a paragraph. :param para_element: Paragraph XML element :return: List of footnote IDs """ footnote_ids = [] # Find all footnote references in this paragraph for run in para_element.findall(".//w:r", NAMESPACES): footnote_ref = run.find(".//w:footnoteReference", NAMESPACES) if ( footnote_ref is not None and f"{{{NAMESPACES['w']}}}id" in footnote_ref.attrib ): footnote_id = footnote_ref.attrib[f"{{{NAMESPACES['w']}}}id"] footnote_ids.append(footnote_id) return footnote_ids def _extract_comment_references(self, para_element: ET.Element) -> list[str]: """ Extracts comment references from a paragraph. :param para_element: Paragraph XML element :return: List of comment IDs """ comment_ids = [] # Find all comment references in this paragraph for run in para_element.findall(".//w:r", NAMESPACES): comment_ref = run.find(".//w:commentReference", NAMESPACES) if ( comment_ref is not None and f"{{{NAMESPACES['w']}}}id" in comment_ref.attrib ): comment_id = comment_ref.attrib[f"{{{NAMESPACES['w']}}}id"] comment_ids.append(comment_id) return comment_ids def _process_footnotes( self, package: DocxPackage, strict_mode: bool = False, include_textboxes: bool = True, ) -> list[Paragraph]: """ Processes footnotes from the footnotes.xml file and converts them to Paragraph objects. 
:param package: DocxPackage object :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :param include_textboxes: If True, include textbox content (default: True) :return: List of Paragraph objects representing footnotes """ footnote_paragraphs = [] if package.footnotes is None: return footnote_paragraphs # Find all footnote elements (excluding separators and continuation separators) for footnote_elem in package.footnotes.findall(".//w:footnote", NAMESPACES): # Skip special footnotes (separators and continuation notices) if f"{{{NAMESPACES['w']}}}id" not in footnote_elem.attrib: continue footnote_id = footnote_elem.attrib[f"{{{NAMESPACES['w']}}}id"] if footnote_id in ("-1", "0"): # Separator and continuation separator continue # Process each paragraph in the footnote for para in footnote_elem.findall(".//w:p", NAMESPACES): # Extract the text content para_text = self._extract_paragraph_text( para, strict_mode=strict_mode, include_textboxes=include_textboxes ).strip() if para_text: # Get paragraph style and metadata style_id = self._get_paragraph_style(para) style_name = self._get_style_name(style_id, package) # Include footnote ID in the metadata footnote_info = f"Style: {style_name}, Footnote: {footnote_id}" # Create paragraph object with metadata footnote_paragraphs.append( Paragraph(raw_text=para_text, additional_context=footnote_info) ) return footnote_paragraphs def _process_comments( self, package: DocxPackage, strict_mode: bool = False, include_textboxes: bool = True, ) -> list[Paragraph]: """ Processes comments from the comments.xml file and converts them to Paragraph objects. 
:param package: DocxPackage object :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :param include_textboxes: If True, include textbox content (default: True) :return: List of Paragraph objects representing comments """ comment_paragraphs = [] if package.comments is None: return comment_paragraphs # Find all comment elements with explicit namespace comment_elements = package.comments.findall( f".//{{{NAMESPACES['w']}}}comment", NAMESPACES ) for comment_elem in comment_elements: # Skip comments without an ID if f"{{{NAMESPACES['w']}}}id" not in comment_elem.attrib: continue comment_id = comment_elem.attrib[f"{{{NAMESPACES['w']}}}id"] # Get comment author if available author = "" if f"{{{NAMESPACES['w']}}}author" in comment_elem.attrib: author = comment_elem.attrib[f"{{{NAMESPACES['w']}}}author"] # Get comment date if available date = "" if f"{{{NAMESPACES['w']}}}date" in comment_elem.attrib: date = comment_elem.attrib[f"{{{NAMESPACES['w']}}}date"] # Process each paragraph in the comment with explicit namespace for para in comment_elem.findall(f".//{{{NAMESPACES['w']}}}p", NAMESPACES): # Extract the text content para_text = self._extract_paragraph_text( para, strict_mode=strict_mode, include_textboxes=include_textboxes ).strip() if para_text: # Get paragraph style and metadata style_id = self._get_paragraph_style(para) style_name = self._get_style_name(style_id, package) # Build metadata comment_info = f"Style: {style_name}, Comment: {comment_id}" if author: comment_info += f", Author: {author}" if date: comment_info += f", Date: {date}" # Create paragraph object with metadata comment_paragraphs.append( Paragraph(raw_text=para_text, additional_context=comment_info) ) return comment_paragraphs def _extract_paragraph_text( self, para_element: ET.Element, strict_mode: bool = False, include_textboxes: bool = True, ) -> str: """ Extracts the text content from a paragraph element. 
:param para_element: Paragraph XML element :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :param include_textboxes: If True, include textbox content (default: True) :return: Text content of the paragraph """ # Use a dictionary to track text content by location text_by_location = {} ordered_runs = [] # Track processed element IDs to avoid technical duplicates processed_elem_ids = set() try: # Process regular paragraph text first (most common case) run_idx = 0 for run in para_element.findall(".//w:r", NAMESPACES): # Skip runs that are part of drawings (we'll handle them separately) if run.find(".//w:drawing", NAMESPACES) is not None: continue # Process text elements in this run for text_elem in run.findall(".//w:t", NAMESPACES): elem_id = id(text_elem) if text_elem.text and elem_id not in processed_elem_ids: text_by_location[run_idx] = text_elem.text ordered_runs.append(run_idx) processed_elem_ids.add(elem_id) run_idx += 1 # Process line breaks in this run for br in run.findall(".//w:br", NAMESPACES): text_by_location[run_idx] = "\n" ordered_runs.append(run_idx) run_idx += 1 # Add footnote reference marker if this run contains a footnote reference footnote_ref = run.find(".//w:footnoteReference", NAMESPACES) if ( footnote_ref is not None and f"{{{NAMESPACES['w']}}}id" in footnote_ref.attrib ): footnote_id = footnote_ref.attrib[f"{{{NAMESPACES['w']}}}id"] text_by_location[run_idx] = ( f"[Footnote {footnote_id}]" # Use footnote ID as marker ) ordered_runs.append(run_idx) run_idx += 1 # Add comment reference marker if this run contains a comment reference comment_ref = run.find(".//w:commentReference", NAMESPACES) if ( comment_ref is not None and f"{{{NAMESPACES['w']}}}id" in comment_ref.attrib ): comment_id = comment_ref.attrib[f"{{{NAMESPACES['w']}}}id"] text_by_location[run_idx] = ( f"[Comment {comment_id}]" # Use comment ID as marker ) ordered_runs.append(run_idx) run_idx += 1 # 
Process drawing objects (incl. text boxes) - these need special handling # Skip this section if include_textboxes is False if include_textboxes: # We keep track of which drawings we've seen to avoid duplicates but still permit # intentional repetition of text boxes # Group 1: Standard VML textboxes vml_idx = 1000 for textbox in para_element.findall(".//v:textbox", NAMESPACES): for text_elem in textbox.findall(".//w:t", NAMESPACES): elem_id = id(text_elem) if text_elem.text and elem_id not in processed_elem_ids: text_by_location[vml_idx] = text_elem.text ordered_runs.append(vml_idx) processed_elem_ids.add(elem_id) vml_idx += 1 # Group 2: DrawingML textboxes (Office 2007+ format) dml_idx = 2000 txbx_content_elems = para_element.findall( ".//w:txbxContent", NAMESPACES ) for txbx_content in txbx_content_elems: # Process each paragraph in the text box content for p in txbx_content.findall(".//w:p", NAMESPACES): for text_elem in p.findall(".//w:t", NAMESPACES): elem_id = id(text_elem) if text_elem.text and elem_id not in processed_elem_ids: text_by_location[dml_idx] = text_elem.text ordered_runs.append(dml_idx) processed_elem_ids.add(elem_id) dml_idx += 1 # Group 3: DrawingML text directly in shapes shape_idx = 3000 for text_elem in para_element.findall(".//a:t", NAMESPACES): elem_id = id(text_elem) if text_elem.text and elem_id not in processed_elem_ids: text_by_location[shape_idx] = text_elem.text ordered_runs.append(shape_idx) processed_elem_ids.add(elem_id) shape_idx += 1 # Group 4: Drawing elements that might contain text not captured by other groups drawing_idx = 4000 for drawing in para_element.findall(".//w:drawing", NAMESPACES): # Extract any text elements that might be in the drawing but not covered by previous groups for text_elem in drawing.findall(".//w:t", NAMESPACES): elem_id = id(text_elem) if text_elem.text and elem_id not in processed_elem_ids: text_by_location[drawing_idx] = text_elem.text ordered_runs.append(drawing_idx) 
processed_elem_ids.add(elem_id) drawing_idx += 1 # Group 5: Handle Markup Compatibility (mc) alternate content # This is crucial because Word often uses this for cross-version compatibility # and the same content can appear in both the Choice and Fallback sections mc_idx = 5000 for mc_elem in para_element.findall(".//mc:AlternateContent", NAMESPACES): # First try the Choice content (preferred for newer versions of Word) choice_elems = mc_elem.findall(".//mc:Choice", NAMESPACES) fallback_elems = mc_elem.findall(".//mc:Fallback", NAMESPACES) # We only want to process either Choice OR Fallback, not both, as they represent # alternate representations of the same content if choice_elems: for choice in choice_elems: # If include_textboxes is False, skip textboxes in markup compatibility content if not include_textboxes: # Skip textbox content within this choice element has_textbox = ( choice.find(".//v:textbox", NAMESPACES) is not None or choice.find(".//w:txbxContent", NAMESPACES) is not None or choice.find(".//a:t", NAMESPACES) is not None or choice.find(".//w:drawing", NAMESPACES) is not None ) if has_textbox: continue for text_elem in choice.findall(".//w:t", NAMESPACES): elem_id = id(text_elem) if text_elem.text and elem_id not in processed_elem_ids: text_by_location[mc_idx] = text_elem.text ordered_runs.append(mc_idx) processed_elem_ids.add(elem_id) mc_idx += 1 # Only use Fallback if we didn't find any usable Choice elements elif fallback_elems: # Check if we've already extracted text from a Choice element for fallback in fallback_elems: # If include_textboxes is False, skip textboxes in markup compatibility content if not include_textboxes: # Skip textbox content within this fallback element has_textbox = ( fallback.find(".//v:textbox", NAMESPACES) is not None or fallback.find(".//w:txbxContent", NAMESPACES) is not None or fallback.find(".//a:t", NAMESPACES) is not None or fallback.find(".//w:drawing", NAMESPACES) is not None ) if has_textbox: continue for 
text_elem in fallback.findall(".//w:t", NAMESPACES): elem_id = id(text_elem) if text_elem.text and elem_id not in processed_elem_ids: text_by_location[mc_idx] = text_elem.text ordered_runs.append(mc_idx) processed_elem_ids.add(elem_id) mc_idx += 1 # Sort the runs to maintain document order ordered_runs.sort() # Get raw text parts text_parts = [text_by_location[idx] for idx in ordered_runs] # Post-processing step: fix text box duplication where identical text appears consecutively # This handles cases where Word stores the same text multiple times in the XML processed_text = [] i = 0 while i < len(text_parts): # Start with the current text segment current_segment = text_parts[i] # Check if the same text is immediately repeated (common in text boxes) j = i + 1 while j < len(text_parts) and text_parts[j] == current_segment: # Skip consecutive identical segments j += 1 # Add the text segment once and skip all duplicates processed_text.append(current_segment) i = j return "".join(processed_text) except Exception as e: if strict_mode: raise DocxContentError( f"Error extracting paragraph text: {str(e)}" ) from e else: logger.warning(f"Error extracting paragraph text: {str(e)}") return "" def _is_text_box_paragraph(self, para_element: ET.Element) -> bool: """ Determines if a paragraph is from a text box. :param para_element: Paragraph XML element :return: True if the paragraph is part of a text box """ # Check for various types of text boxes in Word # 1. VML textbox (older Word format) if para_element.find(".//v:textbox", NAMESPACES) is not None: return True # 2. DrawingML text box (Office 2007+) if para_element.find(".//w:txbxContent", NAMESPACES) is not None: return True # 3. Check for shape with text if para_element.find(".//a:t", NAMESPACES) is not None: return True # 4. 
Check for drawing element if para_element.find(".//w:drawing", NAMESPACES) is not None: return True return False def _process_paragraph( self, para_element: ET.Element, package: DocxPackage, markdown_mode: bool = False, strict_mode: bool = False, include_textboxes: bool = True, ) -> Optional[str | Paragraph]: """ Processes a paragraph element and returns either a markdown string or Paragraph object. :param para_element: Paragraph XML element :param package: DocxPackage object :param markdown_mode: If True, return markdown formatted text, otherwise return a Paragraph object (default: False) :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :param include_textboxes: If True, include textbox content (default: True) :return: Markdown string, Paragraph object, or None if paragraph is empty """ try: # Check if this is a text box paragraph and we should skip it if not include_textboxes and self._is_text_box_paragraph(para_element): return None # Extract text content text = self._extract_paragraph_text( para_element, strict_mode=strict_mode, include_textboxes=include_textboxes, ).strip() if not text: return None # Get style information style_id = self._get_paragraph_style(para_element) style_name = self._get_style_name(style_id, package) style_info = f"Style: {style_name}" # Get list information is_list, list_level, _, list_type, is_numbered = self._get_list_info( para_element, package ) # Get footnote reference information footnote_info = "" footnote_ids = self._extract_footnote_references(para_element) if footnote_ids: footnote_info = f", Footnote References: {','.join(footnote_ids)}" # Get comment reference information comment_info = "" comment_ids = self._extract_comment_references(para_element) if comment_ids: comment_info = f", Comment References: {','.join(comment_ids)}" # Check if this is a text box paragraph text_box_info = "" if self._is_text_box_paragraph(para_element): text_box_info 
= ", Text Box" if markdown_mode: # Convert to markdown based on style and list status if style_name.lower().startswith("heading"): # Extract heading level (e.g., "Heading 1" -> 1) heading_level = 1 match = re.search(r"(\d+)", style_name) if match: heading_level = int(match.group(1)) return "#" * heading_level + " " + text elif is_list: # Add indentation based on list level indent = " " * list_level # Use the appropriate list marker based on list type if is_numbered: # For numbered lists, use "1. " format # Note: Markdown doesn't support different numbered formats, # but it will render as a numbered list return f"{indent}1. {text}" else: # For bullet lists, use "- " format return f"{indent}- {text}" else: # Regular paragraph return text else: # Return a Paragraph instance with metadata metadata = style_info # Add list information with more details if is_list: list_type_info = "Numbered" if is_numbered else "Bullet" metadata += f", List Type: {list_type_info}, Level: {list_level}" if list_type: metadata += f", Format: {list_type}" # Extract List ID from original _get_list_info results p_pr = para_element.find("w:pPr", NAMESPACES) if p_pr is not None: num_pr = p_pr.find("w:numPr", NAMESPACES) if num_pr is not None: num_id_elem = num_pr.find("w:numId", NAMESPACES) if num_id_elem is not None: list_id = num_id_elem.attrib[ f"{{{NAMESPACES['w']}}}val" ] metadata += f", List ID: {list_id}" metadata += footnote_info + comment_info + text_box_info return Paragraph(raw_text=text, additional_context=metadata) except DocxXmlError: # Re-raise specific XML errors raise except Exception as e: if strict_mode: raise DocxContentError(f"Error processing paragraph: {str(e)}") from e else: logger.warning(f"Error processing paragraph: {str(e)}") return None def _process_table( self, table_element: ET.Element, package: DocxPackage, markdown_mode: bool = False, table_idx: int = 0, strict_mode: bool = False, include_textboxes: bool = True, ) -> list[str | Paragraph]: """ Processes a table 
element and returns either paragraphs or markdown lines. :param table_element: Table XML element :param package: DocxPackage object :param markdown_mode: If True, return markdown formatted lines, otherwise return Paragraph objects (default: False) :param table_idx: Index of the table in the document (default: 0) :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :param include_textboxes: If True, include textbox content (default: True) :return: List of markdown lines or Paragraph objects """ result = [] try: if markdown_mode: # Process table for markdown output rows = table_element.findall(".//w:tr", NAMESPACES) if not rows: return result # Collect all cell data and determine column widths all_rows = [] col_widths = [] for row in rows: row_cells = [] for cell in row.findall(".//w:tc", NAMESPACES): # Combine all text from paragraphs in the cell cell_text = [] for para in cell.findall(".//w:p", NAMESPACES): # Process paragraph processed_para = self._process_paragraph( para, package, True, strict_mode, include_textboxes ) if processed_para: cell_text.append(processed_para) cell_content = " ".join(cell_text).strip() or " " row_cells.append(cell_content) all_rows.append(row_cells) # Update max widths if not col_widths: col_widths = [len(cell) for cell in row_cells] else: for i, cell in enumerate(row_cells): if i < len(col_widths): col_widths[i] = max(col_widths[i], len(cell)) # Format the table as markdown for row_idx, row_cells in enumerate(all_rows): # Pad cells for alignment padded_cells = [] for i, cell in enumerate(row_cells): if i < len(col_widths): padded_cells.append(cell.ljust(col_widths[i])) else: padded_cells.append(cell) result.append("| " + " | ".join(padded_cells) + " |") # Add header separator after first row if row_idx == 0: separator = [] for width in col_widths[: len(row_cells)]: separator.append("-" * width) result.append("| " + " | ".join(separator) + " |") # Add blank line 
after table result.append("") else: # Process table for Paragraph objects table_metadata = f"Table: {table_idx+1}" rows = table_element.findall(".//w:tr", NAMESPACES) for row_idx, row in enumerate(rows): for cell_idx, cell in enumerate(row.findall(".//w:tc", NAMESPACES)): for para in cell.findall(".//w:p", NAMESPACES): # Process paragraph processed_para = self._process_paragraph( para, package, False, strict_mode, include_textboxes ) if processed_para: style_id = self._get_paragraph_style(para) style_name = self._get_style_name(style_id, package) cell_style_info = f"Style: {style_name}" # Copy the paragraph with added table metadata cell_para = Paragraph( raw_text=processed_para.raw_text, additional_context=f"{cell_style_info}, {table_metadata}, " f"Row: {row_idx+1}, Column: {cell_idx+1}, " f"Table Cell" + ( ", " + processed_para.additional_context.split( ", ", 1 )[1] if ", " in processed_para.additional_context else "" ), ) result.append(cell_para) return result except Exception as e: # Handle table parsing errors if isinstance(e, DocxConverterError): # Re-raise specific converter errors raise else: if strict_mode: raise DocxContentError(f"Error processing table: {str(e)}") from e else: logger.warning( f"Error processing table (idx: {table_idx}): {str(e)}" ) # Return whatever we've processed so far return result def _process_headers( self, package: DocxPackage, strict_mode: bool = False, include_textboxes: bool = True, ) -> list[Paragraph]: """ Processes headers from the header XML files and converts them to Paragraph objects. 
:param package: DocxPackage object :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :param include_textboxes: If True, include textbox content (default: True) :return: List of Paragraph objects representing headers """ header_paragraphs = [] if not package.headers: return header_paragraphs # Process each header for header_id, header_info in package.headers.items(): header_content = header_info["content"] # Process each paragraph in the header for para in header_content.findall( f".//{{{NAMESPACES['w']}}}p", NAMESPACES ): # Extract the text content para_text = self._extract_paragraph_text( para, strict_mode=strict_mode, include_textboxes=include_textboxes ).strip() if para_text: # Get paragraph style and metadata style_id = self._get_paragraph_style(para) style_name = self._get_style_name(style_id, package) # Build metadata header_info = f"Style: {style_name}, Header: {header_id}" # Create paragraph object with metadata header_paragraphs.append( Paragraph(raw_text=para_text, additional_context=header_info) ) return header_paragraphs def _process_footers( self, package: DocxPackage, strict_mode: bool = False, include_textboxes: bool = True, ) -> list[Paragraph]: """ Processes footers from the footer XML files and converts them to Paragraph objects. 
:param package: DocxPackage object :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :param include_textboxes: If True, include textbox content (default: True) :return: List of Paragraph objects representing footers """ footer_paragraphs = [] if not package.footers: return footer_paragraphs # Process each footer for footer_id, footer_info in package.footers.items(): footer_content = footer_info["content"] # Process each paragraph in the footer for para in footer_content.findall( f".//{{{NAMESPACES['w']}}}p", NAMESPACES ): # Extract the text content para_text = self._extract_paragraph_text( para, strict_mode=strict_mode, include_textboxes=include_textboxes ).strip() if para_text: # Get paragraph style and metadata style_id = self._get_paragraph_style(para) style_name = self._get_style_name(style_id, package) # Build metadata footer_info = f"Style: {style_name}, Footer: {footer_id}" # Create paragraph object with metadata footer_paragraphs.append( Paragraph(raw_text=para_text, additional_context=footer_info) ) return footer_paragraphs def _process_docx_elements( self, package: DocxPackage, markdown_mode: bool = False, include_tables: bool = True, include_comments: bool = True, include_footnotes: bool = True, include_headers: bool = True, include_footers: bool = True, include_textboxes: bool = True, strict_mode: bool = False, ) -> list[str | Paragraph]: """ Processes all elements in the DOCX document and returns appropriate objects. 
:param package: DocxPackage object :param markdown_mode: If True, return markdown formatted lines, otherwise return objects (default: False) :param include_tables: If True, include tables in the output (default: True) :param include_comments: If True, include comments in the output (default: True) :param include_footnotes: If True, include footnotes in the output (default: True) :param include_headers: If True, include headers in the output (default: True) :param include_footers: If True, include footers in the output (default: True) :param include_textboxes: If True, include textbox content (default: True) :param strict_mode: If True, raise exceptions for any processing error instead of skipping problematic elements (default: False) :return: List of markdown lines or Paragraph objects """ result = [] if package.main_document is None: raise DocxContentError("Main document content is missing") try: # Get the body element body = package.main_document.find( f".//{{{NAMESPACES['w']}}}body", NAMESPACES ) if body is None: raise DocxContentError("Document body element is missing") # Process headers if include_headers: try: header_paragraphs = self._process_headers( package, strict_mode=strict_mode, include_textboxes=include_textboxes, ) if markdown_mode and header_paragraphs: for para in header_paragraphs: # Add clear Header marker result.append(f"**Header**: {para.raw_text}") result.append("") else: # For object mode, add headers at the beginning result.extend(header_paragraphs) except Exception as e: # In strict mode, re-raise as DocxContentError if strict_mode: raise DocxContentError( f"Error processing headers: {str(e)}" ) from e # Otherwise, log error and continue without headers logger.warning(f"Error processing headers: {str(e)}") # Track tables for indexing table_count = 0 # Track numbered lists for proper sequencing in markdown mode list_counters = {} # {(list_id, level): counter} last_list_id = None last_list_level = -1 # Process each element in order for 
element in body: tag = element.tag.split("}")[-1] # Remove namespace prefix if tag == "p": # Process paragraph try: # Before processing, check if this is a list item that needs # special handling for markdown if markdown_mode: # Extract list information p_pr = element.find("w:pPr", NAMESPACES) if p_pr is not None: num_pr = p_pr.find("w:numPr", NAMESPACES) if num_pr is not None: # This is a list item _, list_level, __, ___, is_numbered = ( self._get_list_info(element, package) ) # Get num_id for this list item num_id_elem = num_pr.find("w:numId", NAMESPACES) if num_id_elem is not None: num_id = num_id_elem.attrib[ f"{{{NAMESPACES['w']}}}val" ] # If it's a numbered list, we need to track counter if is_numbered: list_key = (num_id, list_level) # Reset counter if this is a new list or a higher # level in the same list if ( last_list_id != num_id or list_level < last_list_level ): # Reset counters for all levels below # the current level for key in list(list_counters.keys()): if ( key[0] == num_id and key[1] > list_level ): list_counters.pop(key) # Initialize counter if needed if list_key not in list_counters: list_counters[list_key] = 1 else: list_counters[list_key] += 1 # Remember this list for next iteration last_list_id = num_id last_list_level = list_level # Now extract paragraph text to build the markdown text = self._extract_paragraph_text( element, strict_mode=strict_mode, include_textboxes=include_textboxes, ).strip() if text: # Add indentation based on list level indent = " " * list_level # Use actual number from counter result.append( f"{indent}{list_counters[list_key]}. 
{text}" ) result.append("") # Add blank line continue # Skip normal processing # Regular processing for non-numbered lists or non-markdown mode processed_para = self._process_paragraph( element, package, markdown_mode, strict_mode, include_textboxes, ) if processed_para is not None: result.append(processed_para) # Add blank line after paragraphs in markdown mode if markdown_mode: result.append("") except Exception as e: if strict_mode: # In strict mode, re-raise as DocxContentError raise DocxContentError( f"Error processing paragraph: {str(e)}" ) # Log error and continue with next paragraph logger.warning(f"Error processing paragraph: {str(e)}") elif tag == "tbl" and include_tables: # Process table try: table_items = self._process_table( element, package, markdown_mode, table_count, strict_mode, include_textboxes, ) result.extend(table_items) table_count += 1 except Exception as e: if strict_mode: # In strict mode, re-raise as DocxContentError raise DocxContentError( f"Error processing table: {str(e)}" ) from e # Log error and continue with next element logger.warning(f"Error processing table: {str(e)}") # Process footnotes and add them as regular paragraphs if include_footnotes and package.footnotes is not None: try: footnote_paragraphs = self._process_footnotes( package, strict_mode=strict_mode, include_textboxes=include_textboxes, ) if markdown_mode and footnote_paragraphs: # Add each footnote as markdown text for para in footnote_paragraphs: footnote_id = para.additional_context.split("Footnote: ")[ 1 ].split(",")[0] result.append( f"**Footnote {footnote_id}**: {para.raw_text}" ) result.append("") else: # For object mode, just add footnotes as paragraphs result.extend(footnote_paragraphs) except Exception as e: if strict_mode: # In strict mode, re-raise as DocxContentError raise DocxContentError( f"Error processing footnotes: {str(e)}" ) from e # Log error and continue without footnotes logger.warning(f"Error processing footnotes: {str(e)}") # Process comments 
and add them as regular paragraphs if include_comments and package.comments is not None: try: comment_paragraphs = self._process_comments( package, strict_mode=strict_mode, include_textboxes=include_textboxes, ) if markdown_mode and comment_paragraphs: # Add each comment as markdown text for para in comment_paragraphs: if "Comment:" in para.additional_context: # Extract comment ID from additional_context comment_id = para.additional_context.split("Comment: ")[ 1 ].split(",")[0] # Extract author if present author = "" if "Author: " in para.additional_context: author = para.additional_context.split("Author: ")[ 1 ].split(",")[0] author = f" (by {author})" result.append( f"**Comment {comment_id}{author}**: {para.raw_text}" ) result.append("") else: # For object mode, just add comments as paragraphs result.extend(comment_paragraphs) except Exception as e: if strict_mode: # In strict mode, re-raise as DocxContentError raise DocxContentError( f"Error processing comments: {str(e)}" ) from e # Log error and continue without comments logger.warning(f"Error processing comments: {str(e)}") # Process footers if include_footers and package.footers: try: footer_paragraphs = self._process_footers( package, strict_mode=strict_mode, include_textboxes=include_textboxes, ) if markdown_mode and footer_paragraphs: for para in footer_paragraphs: # Add clear Footer marker result.append(f"**Footer**: {para.raw_text}") result.append("") else: # For object mode, add footers at the end result.extend(footer_paragraphs) except Exception as e: if strict_mode: # In strict mode, re-raise as DocxContentError raise DocxContentError( f"Error processing footers: {str(e)}" ) from e # Log error and continue without footers logger.warning(f"Error processing footers: {str(e)}") return result except DocxConverterError: # Re-raise specific converter errors raise except Exception as e: # Handle general errors in document processing raise DocxXmlError(f"Error processing document elements: {str(e)}") from e 
def _extract_images(
    self, package: DocxPackage, strict_mode: bool = False
) -> list[Image]:
    """
    Extracts images from the DOCX document.

    Each entry of ``package.images`` is base64-encoded and wrapped in an Image.
    Entries whose MIME type is not one of the supported types are labelled
    as ``image/png``; their bytes are passed through unchanged.

    :param package: DocxPackage object
    :param strict_mode: If True, raise exceptions for any processing error
        instead of skipping problematic elements (default: False)
    :return: List of Image objects
    :raises DocxContentError: In strict mode, if an image cannot be converted
    :raises DocxConverterError: For any other unexpected error during extraction
    """
    supported_mime_types = {
        "image/jpg",
        "image/jpeg",
        "image/png",
        "image/webp",
    }
    images = []
    error_count = 0

    try:
        logger.debug(
            f"Extracting images from DOCX (found {len(package.images)} images)"
        )

        for rel_id, image_info in package.images.items():
            # Get image data and mime type
            image_bytes = image_info["data"]
            mime_type = image_info["mime_type"]

            # Ensure mime type is supported
            if mime_type not in supported_mime_types:
                # Default to PNG if unsupported
                # NOTE(review): only the label changes here, the image data is
                # not re-encoded - confirm this is acceptable downstream.
                logger.debug(
                    f"Unsupported image MIME type: {mime_type}, defaulting to image/png"
                )
                mime_type = "image/png"

            try:
                # Convert to base64
                b64_data = base64.b64encode(image_bytes).decode("utf-8")
                images.append(Image(base64_data=b64_data, mime_type=mime_type))
            except Exception as e:
                image_name = image_info.get("target", rel_id)
                # If in strict mode, raise the error
                if strict_mode:
                    raise DocxContentError(
                        f"Error converting image '{image_name}': {str(e)}"
                    ) from e
                # Otherwise log the error and continue with the next image
                error_count += 1
                logger.warning(f"Error converting image '{image_name}': {str(e)}")

        if images:
            logger.info(f"Successfully extracted {len(images)} images from DOCX")
        if error_count > 0:
            logger.warning(f"Failed to extract {error_count} images from DOCX")

        return images

    except DocxConverterError:
        # Re-raise specific converter errors (e.g. the strict-mode
        # DocxContentError above) without re-wrapping them
        raise
    except Exception as e:
        # Handle critical errors extracting images
        raise DocxConverterError(
            f"Error extracting images from DOCX: {str(e)}"
        ) from e
def convert_to_text_format(
    self,
    docx_path_or_file: str | Path | BinaryIO,
    output_format: RawTextMode = "markdown",
    include_tables: bool = True,
    include_comments: bool = True,
    include_footnotes: bool = True,
    include_headers: bool = True,
    include_footers: bool = True,
    include_textboxes: bool = True,
    strict_mode: bool = False,
) -> str:
    """
    Converts a DOCX file directly to text without creating a ContextGem Document.

    :param docx_path_or_file: Path to the DOCX file (as string or Path object)
        or a file-like object
    :param output_format: Output format ("markdown" or "raw"), matched
        case-insensitively (default: "markdown")
    :param include_tables: If True, include tables in the output (default: True)
    :param include_comments: If True, include comments in the output (default: True)
    :param include_footnotes: If True, include footnotes in the output (default: True)
    :param include_headers: If True, include headers in the output (default: True)
    :param include_footers: If True, include footers in the output (default: True)
    :param include_textboxes: If True, include textbox content (default: True)
    :param strict_mode: If True, raise exceptions for any processing error
        instead of skipping problematic elements (default: False)
    :return: Text in the specified format
    :raises DocxConverterError: If the output format is invalid or the
        conversion fails

    .. note::
        When using markdown output format, the following conditions apply:

        * Document structure elements (headings, lists, tables) are preserved
        * Character-level formatting (bold, italic, underline) is intentionally
          skipped to ensure proper text matching between markdown and DOCX content
        * Headings are converted to markdown heading syntax
          (# Heading 1, ## Heading 2, etc.)
        * Lists are converted to markdown list syntax, preserving numbering
          and hierarchy
        * Tables are formatted using markdown table syntax
        * Footnotes, comments, headers, and footers are included as
          specially marked sections
    """
    package = None
    try:
        package = DocxPackage(docx_path_or_file)

        # The format is validated only after the package has been opened
        requested_format = output_format.lower()
        if requested_format not in ("markdown", "raw"):
            raise DocxConverterError(f"Invalid output format: {output_format}")
        as_markdown = requested_format == "markdown"

        # One pass over the document yields either markdown lines
        # or Paragraph objects, depending on the requested format
        items = self._process_docx_elements(
            package,
            markdown_mode=as_markdown,
            include_tables=include_tables,
            include_comments=include_comments,
            include_footnotes=include_footnotes,
            include_headers=include_headers,
            include_footers=include_footers,
            include_textboxes=include_textboxes,
            strict_mode=strict_mode,
        )

        if as_markdown:
            # Markdown lines are joined one per line
            return "\n".join(items)

        # Raw mode: paragraph texts separated by blank lines
        return "\n\n".join(para.raw_text for para in items)
    except DocxConverterError:
        # Converter errors pass through untouched
        raise
    except Exception as e:
        # Anything else is reported and surfaced as a DocxConverterError
        logger.error(f"Error converting DOCX to {output_format}: {str(e)}")
        raise DocxConverterError(
            f"Error converting DOCX to {output_format}: {str(e)}"
        ) from e
    finally:
        # Close the package whether or not the conversion succeeded
        if package:
            package.close()
def convert(
    self,
    docx_path_or_file: str | Path | BinaryIO,
    raw_text_to_md: bool = True,
    include_tables: bool = True,
    include_comments: bool = True,
    include_footnotes: bool = True,
    include_headers: bool = True,
    include_footers: bool = True,
    include_textboxes: bool = True,
    include_images: bool = True,
    strict_mode: bool = False,
) -> Document:
    """
    Converts a DOCX file into a ContextGem Document object.

    The input is opened once; the same package is used to build the
    paragraphs, the document text and (optionally) the images.

    :param docx_path_or_file: Path to the DOCX file (as string or Path object)
        or a file-like object
    :param raw_text_to_md: If True, convert raw text to markdown (default: True)
    :param include_tables: If True, include tables in the output (default: True)
    :param include_comments: If True, include comments in the output (default: True)
    :param include_footnotes: If True, include footnotes in the output (default: True)
    :param include_headers: If True, include headers in the output (default: True)
    :param include_footers: If True, include footers in the output (default: True)
    :param include_textboxes: If True, include textbox content (default: True)
    :param include_images: If True, extract and include images (default: True)
    :param strict_mode: If True, raise exceptions for any processing error
        instead of skipping problematic elements (default: False)
    :return: A populated Document object
    :raises DocxConverterError: If the conversion fails
    """
    package = None
    try:
        # Get file name or descriptor for logging
        file_desc = (
            docx_path_or_file
            if isinstance(docx_path_or_file, (str, Path))
            else "file object"
        )
        logger.info(f"Converting DOCX: {file_desc} (strict mode: {strict_mode})")

        # Create DocxPackage
        package = DocxPackage(docx_path_or_file)

        # Options shared by every pass over the document elements
        element_options = {
            "include_tables": include_tables,
            "include_comments": include_comments,
            "include_footnotes": include_footnotes,
            "include_headers": include_headers,
            "include_footers": include_footers,
            "include_textboxes": include_textboxes,
            "strict_mode": strict_mode,
        }

        # Process document elements and get paragraphs
        logger.debug("Processing document elements")
        paragraphs = self._process_docx_elements(
            package, markdown_mode=False, **element_options
        )
        logger.debug(f"Extracted {len(paragraphs)} paragraphs")

        # Generate text representation based on the flag. The already opened
        # package is reused instead of opening the input a second time, so a
        # file-like input only has to be read once.
        output_format = "markdown" if raw_text_to_md else "raw"
        logger.debug(f"Converting to {output_format} format")
        if raw_text_to_md:
            markdown_lines = self._process_docx_elements(
                package, markdown_mode=True, **element_options
            )
            text = "\n".join(markdown_lines)
        else:
            # Raw text is the paragraph texts separated by blank lines
            text = "\n\n".join(para.raw_text for para in paragraphs)

        # Initialize the ContextGem Document
        logger.debug("Creating Document object")
        context_doc = Document(raw_text=text, paragraphs=paragraphs)

        # Process images from DOCX if requested
        if include_images:
            logger.debug("Processing images")
            images = self._extract_images(package, strict_mode=strict_mode)

            # Attach images to the document
            context_doc.images = images
            logger.debug(f"Added {len(images)} images to document")

        logger.info(
            f"DOCX conversion completed successfully: {len(paragraphs)} paragraphs, "
            f"{len(context_doc.images) if include_images else 0} images"
        )

        return context_doc

    except DocxConverterError:
        # Re-raise specific converter errors
        raise
    except Exception as e:
        # Catch any other exceptions and convert to DocxConverterError
        logger.error(f"Error converting DOCX file: {str(e)}")
        raise DocxConverterError(f"Error converting DOCX file: {str(e)}") from e
    finally:
        # Ensure the package is closed even if an exception occurs
        if package is not None:
            package.close()