ai-functions
Version:
Core AI primitives for building intelligent applications
456 lines (370 loc) • 12.6 kB
text/typescript
/**
* Data Extraction from Documents Example
*
* This example demonstrates extracting structured data from various document types
* using ai-functions. It shows how to:
* - Extract entities from unstructured text
* - Parse specific document formats (invoices, resumes, contracts)
* - Handle tables and structured sections
* - Validate extracted data
*
* @example
* ```bash
* ANTHROPIC_API_KEY=sk-... npx tsx examples/05-document-extraction.ts
* ```
*/
import { ai, extract, list, is, configure, schema } from '../src/index.js'
// ============================================================================
// Sample Documents
// ============================================================================
const sampleInvoice = `
INVOICE #INV-2024-0042
Bill To: Ship To:
Acme Corporation Acme Corporation
123 Business Lane 456 Warehouse Rd
San Francisco, CA 94102 Oakland, CA 94612
Invoice Date: January 15, 2024
Due Date: February 14, 2024
Payment Terms: Net 30
| Item | Qty | Unit Price | Total |
|------------------------|-----|------------|------------|
| Widget Pro (WP-100) | 50 | $29.99 | $1,499.50 |
| Widget Basic (WB-50) | 100 | $14.99 | $1,499.00 |
| Premium Support (1yr) | 1 | $999.00 | $999.00 |
| Shipping & Handling | - | - | $45.00 |
Subtotal: $4,042.50
Tax (8.5%): $343.61
TOTAL DUE: $4,386.11
Payment Methods: Check, Wire Transfer, ACH
Bank: First National Bank
Account: 1234567890
Routing: 021000021
`
const sampleResume = `
SARAH CHEN
Software Engineer | AI/ML Specialist
san.chen@email.com | (555) 123-4567 | linkedin.com/in/sarahchen | github.com/schen
SUMMARY
Experienced software engineer with 6+ years building ML systems at scale.
Passionate about NLP and computer vision applications.
EXPERIENCE
Senior Machine Learning Engineer | TechCorp Inc. | Jan 2021 - Present
- Led team of 5 engineers building recommendation system serving 10M users
- Reduced model inference latency by 40% through optimization
- Implemented A/B testing framework for ML experiments
- Technologies: Python, TensorFlow, Kubernetes, AWS SageMaker
Software Engineer | StartupAI | June 2018 - Dec 2020
- Built NLP pipeline processing 1M documents daily
- Developed real-time fraud detection system (99.2% accuracy)
- Created data labeling tools that reduced annotation time by 60%
EDUCATION
M.S. Computer Science | Stanford University | 2018
- Focus: Machine Learning, Natural Language Processing
- GPA: 3.9/4.0
B.S. Computer Science | UC Berkeley | 2016
- Summa Cum Laude
- Dean's List all semesters
SKILLS
Languages: Python, TypeScript, Go, SQL
ML/AI: TensorFlow, PyTorch, scikit-learn, Hugging Face
Cloud: AWS, GCP, Kubernetes, Docker
Other: Git, CI/CD, Agile/Scrum
CERTIFICATIONS
- AWS Certified Machine Learning Specialty (2023)
- Google Professional ML Engineer (2022)
`
const sampleContract = `
SERVICE AGREEMENT
This Service Agreement ("Agreement") is entered into as of March 1, 2024
("Effective Date") by and between:
Provider: CloudServices LLC
Address: 789 Tech Blvd, Austin, TX 78701
Contact: contracts@cloudservices.com
Client: RetailCo Inc.
Address: 321 Commerce St, Dallas, TX 75201
Contact: legal@retailco.com
1. SERVICES
Provider agrees to provide cloud hosting services including:
- Dedicated server instances (4x 32-core, 128GB RAM)
- 99.9% uptime SLA
- 24/7 technical support
- Daily automated backups
2. TERM
Initial term: 24 months
Auto-renewal: Yes, 12-month periods unless cancelled with 60 days notice
3. FEES
Monthly fee: $8,500
Annual total: $102,000
Payment due: Net 15 from invoice date
4. TERMINATION
Either party may terminate with 60 days written notice.
Early termination fee: 3 months of fees.
5. CONFIDENTIALITY
Both parties agree to maintain confidentiality of proprietary information.
Duration: 3 years after termination.
Signed:
_____________________ _____________________
CloudServices LLC RetailCo Inc.
Date: March 1, 2024 Date: March 1, 2024
`
// ============================================================================
// Extraction Schemas
// ============================================================================
interface InvoiceData {
invoiceNumber: string
invoiceDate: string
dueDate: string
billTo: {
company: string
address: string
}
shipTo: {
company: string
address: string
}
lineItems: {
description: string
quantity: number
unitPrice: number
total: number
}[]
subtotal: number
tax: number
total: number
paymentTerms: string
}
interface ResumeData {
name: string
title: string
contact: {
email: string
phone: string
linkedin?: string
github?: string
}
summary: string
experience: {
title: string
company: string
period: string
highlights: string[]
}[]
education: {
degree: string
school: string
year: string
details?: string
}[]
skills: {
category: string
items: string[]
}[]
certifications: string[]
}
interface ContractData {
type: string
effectiveDate: string
parties: {
role: string
name: string
address: string
contact: string
}[]
term: {
duration: string
autoRenewal: boolean
cancellationNotice: string
}
financials: {
monthlyFee: number
annualTotal: number
paymentTerms: string
}
keyTerms: string[]
}
// ============================================================================
// Extractors
// ============================================================================
async function extractInvoice(document: string): Promise<InvoiceData> {
console.log('\n--- Extracting Invoice Data ---')
const data = await ai`Extract all data from this invoice:
${document}
Provide a complete extraction with:
- invoiceNumber: the invoice number/ID
- invoiceDate: invoice date
- dueDate: payment due date
- billTo: { company, address }
- shipTo: { company, address }
- lineItems: array of { description, quantity (number), unitPrice (number), total (number) }
- subtotal: number
- tax: number
- total: number
- paymentTerms: payment terms`
return data as unknown as InvoiceData
}
async function extractResume(document: string): Promise<ResumeData> {
console.log('\n--- Extracting Resume Data ---')
const data = await ai`Extract all data from this resume:
${document}
Provide a complete extraction with:
- name: candidate name
- title: current/desired title
- contact: { email, phone, linkedin, github }
- summary: professional summary
- experience: array of { title, company, period, highlights: array of achievements }
- education: array of { degree, school, year, details }
- skills: array of { category, items: array of skills }
- certifications: array of certification names`
return data as unknown as ResumeData
}
async function extractContract(document: string): Promise<ContractData> {
console.log('\n--- Extracting Contract Data ---')
const data = await ai`Extract all data from this contract:
${document}
Provide a complete extraction with:
- type: type of agreement
- effectiveDate: when the contract starts
- parties: array of { role (provider/client), name, address, contact }
- term: { duration, autoRenewal: boolean, cancellationNotice }
- financials: { monthlyFee: number, annualTotal: number, paymentTerms }
- keyTerms: array of important contract terms/obligations`
return data as unknown as ContractData
}
// ============================================================================
// Entity Extraction
// ============================================================================
async function extractEntities(text: string): Promise<void> {
console.log('\n--- Entity Extraction ---')
// Extract different entity types
const emails = await extract`all email addresses from: ${text}`
console.log('Emails:', emails)
const phones = await extract`all phone numbers from: ${text}`
console.log('Phones:', phones)
const dates = await extract`all dates and time periods from: ${text}`
console.log('Dates:', dates)
const amounts = await extract`all monetary amounts and numbers from: ${text}`
console.log('Amounts:', amounts)
const companies = await extract`all company/organization names from: ${text}`
console.log('Companies:', companies)
}
// ============================================================================
// Validation
// ============================================================================
async function validateExtraction(
original: string,
extracted: unknown,
docType: string
): Promise<{ valid: boolean; issues: string[] }> {
console.log(`\n--- Validating ${docType} Extraction ---`)
const { valid, issues } = await ai`Validate this extraction against the original document:
Original Document:
${original.substring(0, 1000)}...
Extracted Data:
${JSON.stringify(extracted, null, 2)}
Check for:
- Missing important information
- Incorrect values or parsing errors
- Data type issues (strings vs numbers)
Provide:
- valid: boolean (true if extraction is accurate and complete)
- issues: array of any problems found (empty if valid)`
const validationResult = {
valid: valid as boolean,
issues: issues as string[],
}
console.log(`Valid: ${validationResult.valid}`)
if (validationResult.issues.length > 0) {
console.log('Issues:', validationResult.issues)
}
return validationResult
}
// ============================================================================
// Format Display
// ============================================================================
function displayInvoice(data: InvoiceData): void {
console.log(`
Invoice: ${data.invoiceNumber}
Date: ${data.invoiceDate} | Due: ${data.dueDate}
Bill To: ${data.billTo?.company}
${data.billTo?.address}
Items:`)
for (const item of data.lineItems || []) {
console.log(` - ${item.description}: ${item.quantity} x $${item.unitPrice} = $${item.total}`)
}
console.log(`
Subtotal: $${data.subtotal}
Tax: $${data.tax}
TOTAL: $${data.total}`)
}
function displayResume(data: ResumeData): void {
console.log(`
Candidate: ${data.name}
Title: ${data.title}
Contact: ${data.contact?.email} | ${data.contact?.phone}
Summary: ${data.summary?.substring(0, 100)}...
Experience:`)
for (const exp of data.experience || []) {
console.log(` ${exp.title} @ ${exp.company} (${exp.period})`)
}
console.log(`
Education:`)
for (const edu of data.education || []) {
console.log(` ${edu.degree} - ${edu.school} (${edu.year})`)
}
console.log(`
Skills: ${(data.skills || [])
.flatMap((s) => s.items)
.slice(0, 10)
.join(', ')}...
Certifications: ${(data.certifications || []).length}`)
}
function displayContract(data: ContractData): void {
console.log(`
Contract Type: ${data.type}
Effective: ${data.effectiveDate}
Parties:`)
for (const party of data.parties || []) {
console.log(` ${party.role}: ${party.name}`)
}
console.log(`
Term: ${data.term?.duration}
Auto-Renewal: ${data.term?.autoRenewal}
Financials:
Monthly: $${data.financials?.monthlyFee}
Annual: $${data.financials?.annualTotal}
Key Terms: ${(data.keyTerms || []).length} extracted`)
}
// ============================================================================
// Main Example
// ============================================================================
async function main() {
console.log('\n=== Document Data Extraction Example ===\n')
// Configure the AI provider
configure({
model: 'sonnet',
provider: 'anthropic',
})
// Extract and display invoice
const invoiceData = await extractInvoice(sampleInvoice)
displayInvoice(invoiceData)
await validateExtraction(sampleInvoice, invoiceData, 'Invoice')
// Extract and display resume
const resumeData = await extractResume(sampleResume)
displayResume(resumeData)
await validateExtraction(sampleResume, resumeData, 'Resume')
// Extract and display contract
const contractData = await extractContract(sampleContract)
displayContract(contractData)
await validateExtraction(sampleContract, contractData, 'Contract')
// General entity extraction demo
await extractEntities(sampleResume)
}
main()
.then(() => {
console.log('\n=== Example Complete ===\n')
process.exit(0)
})
.catch((error) => {
console.error('\nError:', error.message)
process.exit(1)
})