Advanced usage examples

Advanced usage examples

Below are complete, self-contained examples demonstrating advanced usage of ContextGem.

🔍 Extracting Aspects Containing Concepts

Tip

Concept extraction pulls specific data points out of a document or an aspect. For example, a “Payment terms” aspect in a contract may have multiple concepts:

- “Payment amount”
- “Payment due date”
- “Payment method”

# Advanced Usage Example - extracting a single aspect with inner concepts from a legal document

import os

from contextgem import Aspect, Document, DocumentLLM, StringConcept, StringExample


# Source text: an employment agreement used as the sample legal contract
# (abridged to keep the example short)
contract_text = (
    "EMPLOYMENT AGREEMENT\n\n"
    'This Employment Agreement (the "Agreement") is made and entered into as of January 15, 2023 (the "Effective Date"), '
    'by and between ABC Corporation, a Delaware corporation (the "Company"), and Jane Smith, an individual (the "Employee").\n\n'
    # Section 1
    "1. EMPLOYMENT TERM\n"
    "The Company hereby employs the Employee, and the Employee hereby accepts employment with the Company, upon the terms and "
    "conditions set forth in this Agreement. The term of this Agreement shall commence on the Effective Date and shall continue "
    'for a period of two (2) years, unless earlier terminated in accordance with Section 8 (the "Term").\n\n'
    # Section 2
    "2. POSITION AND DUTIES\n"
    "During the Term, the Employee shall serve as Chief Technology Officer of the Company, with such duties and responsibilities "
    "as are commensurate with such position.\n\n"
    # Section 8 - the part the termination aspect below is about
    "8. TERMINATION\n"
    "8.1 Termination by the Company. The Company may terminate the Employee's employment for Cause at any time upon written notice. "
    "\"Cause\" shall mean: (i) Employee's material breach of this Agreement; (ii) Employee's conviction of a felony; or "
    "(iii) Employee's willful misconduct that causes material harm to the Company.\n"
    "8.2 Termination by the Employee. The Employee may terminate employment for Good Reason upon 30 days' written notice to the Company. "
    "\"Good Reason\" shall mean a material reduction in Employee's base salary or a material diminution in Employee's duties.\n"
    "8.3 Severance. If the Employee's employment is terminated by the Company without Cause or by the Employee for Good Reason, "
    "the Employee shall be entitled to receive severance pay equal to six (6) months of the Employee's base salary.\n\n"
    # Signature block
    "IN WITNESS WHEREOF, the parties have executed this Agreement as of the date first written above.\n\n"
    "ABC CORPORATION\n\n"
    "By: ______________________\n"
    "Name: John Johnson\n"
    "Title: CEO\n\n"
    "EMPLOYEE\n\n"
    "______________________\n"
    "Jane Smith"
)

# Wrap the raw text in a Document so that aspects and concepts can be attached to it
doc = Document(raw_text=contract_text)

# Reference settings shared by all concepts below: every concept is extracted
# with references at sentence depth
sentence_level_refs = {"add_references": True, "reference_depth": "sentences"}

# Concepts to be extracted within the termination aspect
termination_for_cause = StringConcept(
    name="Termination for Cause",
    description="Conditions under which the company can terminate the employee for cause.",
    # Examples are optional; they help the LLM to understand the concept better
    examples=[
        StringExample(content="Employee may be terminated for misconduct"),
        StringExample(content="Termination for breach of contract"),
    ],
    **sentence_level_refs,
)
notice_period = StringConcept(
    name="Notice Period",
    description="Required notification period before employment termination.",
    **sentence_level_refs,
)
severance_terms = StringConcept(
    name="Severance Package",
    description="Compensation and benefits provided upon termination.",
    **sentence_level_refs,
)

# The aspect that groups the concepts: it targets the termination clauses
# and uses paragraph-level reference depth
termination_aspect = Aspect(
    name="Termination Provisions",
    description="Analysis of contract termination conditions, notice requirements, and severance terms.",
    reference_depth="paragraphs",
)

# Nest the concepts in the aspect, then attach the aspect to the document
termination_aspect.add_concepts(
    [
        termination_for_cause,
        notice_period,
        severance_terms,
    ]
)
doc.add_aspects([termination_aspect])

# API key for OpenAI (or for whichever LLM provider is used); the lookup
# yields None when the environment variable is not set
openai_api_key = os.environ.get("CONTEXTGEM_OPENAI_API_KEY")

# LLM that extracts the data from the document.
# Models from other providers can be used as well, e.g. "anthropic/claude-3-5-sonnet"
llm = DocumentLLM(model="openai/gpt-4o", api_key=openai_api_key)

# Run the extraction of everything attached to the document
doc = llm.extract_all(doc)

# The extracted information is available on the returned document object
print("=== Termination Provisions Analysis ===")
termination = doc.aspects[0]  # the aspect that was attached to the document
print(f"Extracted {len(termination.extracted_items)} items from the aspect")

# Walk through the concepts nested in the aspect and show each extracted item
# together with the number of sentences it references
for concept in termination.concepts:
    print(f"--- {concept.name} ---")
    for extracted in concept.extracted_items:
        print(f"• {extracted.value}")
        print(f"  Reference sentences: {len(extracted.reference_sentences)}")

📊 Extracting Aspects and Concepts from a Document

Tip

This example demonstrates how to extract both document-level and aspect-specific concepts from a document, including references to the source text. Using concurrency can significantly speed up extraction when working with multiple aspects and concepts.

Document-level concepts apply to the entire document (like “Is Privacy Policy” or “Last Updated Date”), while aspect-specific concepts are tied to particular sections or themes within the document.

# Advanced Usage Example - Extracting aspects and concepts from a document, with references,
# using concurrency

import os

from aiolimiter import AsyncLimiter

from contextgem import (
    Aspect,
    BooleanConcept,
    DateConcept,
    Document,
    DocumentLLM,
    JsonObjectConcept,
    StringConcept,
)


# Source text: a sample privacy policy (abridged to keep the example short)
privacy_policy_text = (
    "Privacy Policy\n\n"
    "Last Updated: March 15, 2024\n\n"
    # Section 1
    "1. Data Collection\n"
    "We collect various types of information from our users, including:\n"
    "- Personal information (name, email address, phone number)\n"
    "- Device information (IP address, browser type, operating system)\n"
    "- Usage data (pages visited, time spent on site)\n"
    "- Location data (with your consent)\n\n"
    # Section 2
    "2. Data Usage\n"
    "We use your information to:\n"
    "- Provide and improve our services\n"
    "- Send you marketing communications (if you opt-in)\n"
    "- Analyze website performance\n"
    "- Comply with legal obligations\n\n"
    # Section 3
    "3. Data Sharing\n"
    "We may share your information with:\n"
    "- Service providers (for processing payments and analytics)\n"
    "- Law enforcement (when legally required)\n"
    "- Business partners (with your explicit consent)\n\n"
    # Section 4
    "4. Data Retention\n"
    "We retain personal data for 24 months after your last interaction with our services. "
    "Analytics data is kept for 36 months.\n\n"
    # Section 5
    "5. User Rights\n"
    "You have the right to:\n"
    "- Access your personal data\n"
    "- Request data deletion\n"
    "- Opt-out of marketing communications\n"
    "- Lodge a complaint with supervisory authorities\n\n"
    # Section 6
    "6. Contact Information\n"
    "For privacy-related inquiries, contact our Data Protection Officer at privacy@example.com\n"
)

# Wrap the raw text in a Document so that aspects and concepts can be attached to it
doc = Document(raw_text=privacy_policy_text)

# Document-level concepts, declared one by one and collected into a list below

# Yes/no check of the document type
is_privacy_policy = BooleanConcept(
    name="Is Privacy Policy",
    description="Verify if this document is a privacy policy",
    singular_occurrence=True,  # optional: explicitly enforce a single extracted item
)

# Date of the last update of the policy
last_updated_date = DateConcept(
    name="Last Updated Date",
    description="The date when the privacy policy was last updated",
    singular_occurrence=True,  # optional: explicitly enforce a single extracted item
)

# Contact details, extracted with references at sentence depth
contact_information = StringConcept(
    name="Contact Information",
    description="Contact details for privacy-related inquiries",
    add_references=True,
    reference_depth="sentences",
)

document_concepts = [is_privacy_policy, last_updated_date, contact_information]

# Aspects and their nested concepts, declared one by one and collected into a list below

# --- Data collection ---
# The structure is a plain dictionary with type hints as values
# (generic aliases and union types are supported too)
collected_data_types = JsonObjectConcept(
    name="Collected Data Types",
    description="List of different types of data collected from users",
    structure={
        "personal_info": list[str],
        "technical_info": list[str],
        "usage_info": list[str],
    },
    add_references=True,
    reference_depth="sentences",
)
data_collection_aspect = Aspect(
    name="Data Collection",
    description="Information about what types of data are collected from users",
    concepts=[collected_data_types],
)

# --- Data retention ---
# `str | None` type hints allow a value to be None when it is not specified
retention_periods = JsonObjectConcept(
    name="Retention Periods",
    description="The durations for which different types of data are retained",
    structure={
        "personal_info": str | None,
        "technical_info": str | None,
        "usage_info": str | None,
    },
    add_references=True,
    reference_depth="sentences",
    singular_occurrence=True,  # optional: explicitly enforce a single extracted item
)
data_retention_aspect = Aspect(
    name="Data Retention",
    description="Information about how long different types of data are retained",
    concepts=[retention_periods],
)

# --- Data subject rights ---
data_subject_rights = StringConcept(
    name="Data Subject Rights",
    description="Rights available to users regarding their personal data",
    add_references=True,
    reference_depth="sentences",
)
data_subject_rights_aspect = Aspect(
    name="Data Subject Rights",
    description="Information about the rights users have regarding their data",
    concepts=[data_subject_rights],
)

aspects = [
    data_collection_aspect,
    data_retention_aspect,
    data_subject_rights_aspect,
]

# Attach the aspects and the document-level concepts to the document
doc.add_aspects(aspects)
doc.add_concepts(document_concepts)

# API key for the applicable LLM provider; the lookup yields None when the
# environment variable is not set
openai_api_key = os.environ.get("CONTEXTGEM_OPENAI_API_KEY")

# LLM used for the extraction (an LLM from e.g. Anthropic, Ollama, etc. works as well)
llm = DocumentLLM(model="openai/gpt-4o", api_key=openai_api_key)

# Optional: customize the async limiter that is used for concurrency
llm.async_limiter = AsyncLimiter(max_rate=3, time_period=3)


# Run the extraction of everything attached to the document, with concurrency enabled
doc = llm.extract_all(doc, use_concurrency=True)

# Print the extracted information that is stored on the document object


def _print_items(extracted_items):
    """Print the value of every extracted item as a bullet point."""
    for extracted in extracted_items:
        print(f"• {extracted.value}")


# Document-level concepts
print("Document Concepts:")
for doc_concept in doc.concepts:
    print(f"{doc_concept.name}:")
    _print_items(doc_concept.extracted_items)
    print()

# Aspects: first the items of the aspect itself, then those of its nested concepts
print("Aspects and Concepts:")
for aspect in doc.aspects:
    print(f"[{aspect.name}]")
    _print_items(aspect.extracted_items)
    print()
    for aspect_concept in aspect.concepts:
        print(f"{aspect_concept.name}:")
        _print_items(aspect_concept.extracted_items)
    print()

🔄 Using a Multi-LLM Pipeline to Extract Data from Several Documents

Tip

A pipeline is a reusable configuration of extraction steps. You can use the same pipeline to extract data from multiple documents.

For example, if your app extracts data from invoices, you can configure a pipeline once, and then use it for each incoming invoice.

# Advanced Usage Example - analyzing multiple documents with a single pipeline,
# with different LLMs, concurrency and cost tracking

import os

from contextgem import (
    Aspect,
    DateConcept,
    Document,
    DocumentLLM,
    DocumentLLMGroup,
    ExtractionPipeline,
    JsonObjectConcept,
    JsonObjectExample,
    LLMPricing,
    NumericalConcept,
    RatingConcept,
    StringConcept,
    StringExample,
)


# Input documents (both texts are abridged to keep the example short)

# Text of document 1 - a consultancy agreement
consultancy_agreement_text = (
    "Consultancy Agreement\n"
    "This agreement between Company A (Supplier) and Company B (Customer)...\n"
    "The term of the agreement is 1 year from the Effective Date...\n"
    "The Supplier shall provide consultancy services as described in Annex 2...\n"
    "The Customer shall pay the Supplier within 30 calendar days of receiving an invoice...\n"
    "All intellectual property created during the provision of services shall belong to the Customer...\n"
    "This agreement is governed by the laws of Norway...\n"
    "Annex 1: Data processing agreement...\n"
    "Annex 2: Statement of Work...\n"
    "Annex 3: Service Level Agreement...\n"
)

# Text of document 2 - a service level agreement
service_level_agreement_text = (
    "Service Level Agreement\n"
    "This agreement between TechCorp (Provider) and GlobalInc (Client)...\n"
    "The agreement shall commence on January 1, 2023 and continue for 2 years...\n"
    "The Provider shall deliver IT support services as outlined in Schedule A...\n"
    "The Client shall make monthly payments of $5,000 within 15 days of invoice receipt...\n"
    "The Provider guarantees [99.9%] uptime for all critical systems...\n"
    "Either party may terminate with 60 days written notice...\n"
    "This agreement is governed by the laws of California...\n"
    "Schedule A: Service Descriptions...\n"
    "Schedule B: Response Time Requirements...\n"
)

# Wrap each text in its own Document
doc1 = Document(raw_text=consultancy_agreement_text)
doc2 = Document(raw_text=service_level_agreement_text)

# --- Aspects and their nested concepts ---
# Concepts nested in an aspect are extracted from the extracted aspect context

party_names_concept = StringConcept(
    name="Party names and roles",
    description="Names of all parties entering into the agreement and their roles",
    # Optional example that gives guidance regarding the expected output format
    examples=[StringExample(content="X (Client)")],
)
parties_aspect = Aspect(
    name="Contract Parties",
    description="Clauses defining the parties to the agreement",
    concepts=[party_names_concept],  # aspect-level concepts, if any
)

contract_term_concept = NumericalConcept(
    name="Contract term",
    description="The term of the agreement in years",
    numeric_type="int",  # "float" is possible as well, or "any" for auto-detection
    add_references=True,  # also extract references to the source text
    reference_depth="paragraphs",
)
term_aspect = Aspect(
    name="Term",
    description="Clauses defining the term of the agreement",
    concepts=[contract_term_concept],
)

# --- Document-level concepts ---
# These are extracted from the whole document content

effective_date_concept = DateConcept(
    name="Effective date",
    description="The effective date of the agreement",
)
contract_type_concept = StringConcept(
    name="Contract type",
    description="The type of agreement",
    llm_role="reasoner_text",  # handled by the more advanced LLM used for reasoning
)
governing_law_concept = StringConcept(
    name="Governing law",
    description="The law that governs the agreement",
)
attachments_concept = JsonObjectConcept(
    name="Attachments",
    description="The titles and concise descriptions of the attachments to the agreement",
    structure={"title": str, "description": str | None},
    # Optional example that gives guidance regarding the expected output format
    examples=[
        JsonObjectExample(
            content={
                "title": "Appendix A",
                "description": "Code of conduct",
            }
        ),
    ],
)
duration_adequacy_concept = RatingConcept(
    name="Duration adequacy",
    description="Contract duration adequacy considering the subject matter and best practices.",
    llm_role="reasoner_text",  # handled by the more advanced LLM used for reasoning
    rating_scale=(1, 10),
    add_justifications=True,  # the rating comes with a justification
    justification_depth="balanced",  # ask for a balanced justification
    justification_max_sents=3,
)

# --- The reusable extraction pipeline ---
contract_pipeline = ExtractionPipeline()
contract_pipeline.aspects = [  # .add_aspects([...]) can be used instead
    parties_aspect,
    term_aspect,
]
contract_pipeline.concepts = [  # .add_concepts() can be used instead
    effective_date_concept,
    contract_type_concept,
    governing_law_concept,
    attachments_concept,
    duration_adequacy_concept,
]

# The same pipeline is re-used for both documents: assigning it gives each
# document the pipeline's aspects and concepts
for contract_document in (doc1, doc2):
    contract_document.assign_pipeline(contract_pipeline)

# Both LLMs below use the same API key; a missing environment variable raises KeyError
openai_api_key = os.environ["CONTEXTGEM_OPENAI_API_KEY"]

# Pricing details are optional and are only needed for the cost calculation.
# Alternatively, pass `auto_pricing=True` to the LLM to automatically fetch
# pricing data from the LLM provider
extractor_pricing = LLMPricing(
    input_per_1m_tokens=0.150,
    output_per_1m_tokens=0.600,
)
reasoner_pricing = LLMPricing(
    input_per_1m_tokens=1.10,
    output_per_1m_tokens=4.40,
)

# LLM for data extraction tasks (any other LLM, e.g. from Anthropic, can be used)
llm_extractor = DocumentLLM(
    model="openai/gpt-4o-mini",
    api_key=openai_api_key,
    role="extractor_text",  # marks the LLM as the one used for data extraction tasks
    pricing_details=extractor_pricing,
)

# LLM for reasoning tasks (any other LLM, e.g. from Anthropic, can be used)
llm_reasoner = DocumentLLM(
    model="openai/o3-mini",
    api_key=openai_api_key,
    role="reasoner_text",  # marks the LLM as the one used for reasoning tasks
    pricing_details=reasoner_pricing,
)

# Combine both LLMs into a group; the group handles all extraction tasks of the pipeline
llm_group = DocumentLLMGroup(llms=[llm_extractor, llm_reasoner])

# Run the full extraction for each document in turn, with concurrency enabled
# to speed up the extraction.
# Async variants are available as well: .extract_all_async(...)
extraction_options = {"use_concurrency": True}
doc1 = llm_group.extract_all(doc1, **extraction_options)
doc2 = llm_group.extract_all(doc2, **extraction_options)

# Show some of the data extracted from the first document
print("Some extracted data from doc 1:")

# An aspect-level concept is looked up through its aspect
print("Contract Parties > Party names and roles:")
doc1_parties_aspect = doc1.get_aspect_by_name("Contract Parties")
doc1_party_names = doc1_parties_aspect.get_concept_by_name("Party names and roles")
print(doc1_party_names.extracted_items)

# A document-level concept is looked up directly on the document
print("Attachments:")
doc1_attachments = doc1.get_concept_by_name("Attachments")
print(doc1_attachments.extracted_items)
# ...

# Show some of the data extracted from the second document.
# An LLM is not guaranteed to return an item for every concept, so each list of
# extracted items is checked before its first element is read; indexing an
# empty list would otherwise abort the script with an IndexError.
print("\nSome extracted data from doc 2:")
print("Term > Contract term:")
contract_term_items = (
    doc2.get_aspect_by_name("Term")
    .get_concept_by_name("Contract term")
    .extracted_items
)
if contract_term_items:
    print(contract_term_items[0].value)
else:
    print("(no items extracted)")

print("Duration adequacy:")
# Look the concept up once and reuse its items for both the value and the justification
duration_adequacy_items = doc2.get_concept_by_name("Duration adequacy").extracted_items
if duration_adequacy_items:
    duration_rating = duration_adequacy_items[0]
    print(duration_rating.value)
    print(duration_rating.justification)
else:
    print("(no items extracted)")
# ...

# Output processing costs (requires setting the pricing details for each LLM)
print("\nProcessing costs:")
print(llm_group.get_cost())

Advanced usage examples

Contents

Advanced usage examples

🔍 Extracting Aspects Containing Concepts

📊 Extracting Aspects and Concepts from a Document

🔄 Using a Multi-LLM Pipeline to Extract Data from Several Documents