// @ansvar/singapore-law-mcp
// Version:
// Complete Singapore law database — 523 Acts, 28K+ provisions from Singapore Statutes Online (sso.agc.gov.sg) with full-text search, definitions, and citation support
// 313 lines • 13.3 kB
// JavaScript
/**
* Singapore Law MCP -- Census-Driven Ingestion Pipeline
*
* Fetches ALL Singapore legislation from Singapore Statutes Online (sso.agc.gov.sg)
* using data/census.json as the source of truth for what to ingest.
*
* SSO provides public access to all current Singapore legislation under the
* Singapore Open Data Licence.
*
* The pipeline performs the same sequence a browser would:
* 1. GET the Act page (receives Part 1 inline + ToC with series IDs)
* 2. For each series ID, GET /Details/GetLazyLoadContent to fetch that part
* 3. Parse the combined HTML into structured provision data
*
* Usage:
* npm run ingest # Full census-driven ingestion
* npm run ingest -- --limit 5 # Test with 5 acts
* npm run ingest -- --skip-fetch # Reuse cached HTML
* npm run ingest -- --resume # Skip acts that already have seed files
* npm run ingest -- --start-from ID # Resume from a specific act ID
*
* Data is sourced under the Singapore Open Data Licence.
*/
import * as fs from 'fs';
import * as path from 'path';
import { fileURLToPath } from 'url';
import { fetchFullAct } from './lib/fetcher.js';
import { parseSsoAct } from './lib/parser.js';
// ES modules do not define __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Every pipeline input and output lives under the data/ directory next to this script's folder.
const DATA_DIR = path.resolve(__dirname, '..', 'data');
// Cached HTML (initial page plus per-act chunk directories).
const SOURCE_DIR = path.join(DATA_DIR, 'source');
// Parsed per-act JSON seed files.
const SEED_DIR = path.join(DATA_DIR, 'seed');
// Census file that lists the laws to ingest.
const CENSUS_PATH = path.join(DATA_DIR, 'census.json');
/**
 * Parse the command-line flags accepted by the ingestion pipeline.
 *
 * Recognised flags:
 *   --limit N         process at most N acts (N must be a positive integer)
 *   --skip-fetch      reuse cached files instead of fetching
 *   --resume          skip acts that already have a seed file
 *   --start-from ID   begin at the given act ID (also turns on --resume)
 *
 * Unrecognised arguments are ignored.
 *
 * @returns {{limit: (number|null), skipFetch: boolean, resume: boolean, startFrom: (string|null)}}
 * @throws {Error} When --limit is missing its value or is not a positive
 *   integer, or when --start-from is missing its value. Failing loudly here
 *   prevents a mistyped test run from silently becoming a full ingestion.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  let limit = null;
  let skipFetch = false;
  let resume = false;
  let startFrom = null;
  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--limit': {
        const raw = args[i + 1];
        const value = Number.parseInt(raw, 10);
        // NaN (missing or non-numeric value) and non-positive counts are rejected.
        if (!Number.isInteger(value) || value <= 0) {
          const shown = raw === undefined ? '(nothing)' : raw;
          throw new Error(`--limit expects a positive integer, got: ${shown}`);
        }
        limit = value;
        i++;
        break;
      }
      case '--skip-fetch':
        skipFetch = true;
        break;
      case '--resume':
        resume = true;
        break;
      case '--start-from': {
        const id = args[i + 1];
        // A following "--flag" means the ID itself was left out.
        if (!id || id.startsWith('--')) {
          throw new Error('--start-from expects an act ID');
        }
        startFrom = id;
        // Starting mid-census implies resuming an earlier run.
        resume = true;
        i++;
        break;
      }
      default:
        // Unknown arguments are ignored.
        break;
    }
  }
  return { limit, skipFetch, resume, startFrom };
}
/**
 * Adapt one census record to the act-entry shape used by the fetcher and parser.
 *
 * The census `identifier` doubles as the abbreviation and is prefixed with
 * `Act/` to form the act code; `title` is used for both title fields. The two
 * date fields are always emitted as empty strings.
 *
 * @param {object} law - Census record (reads id, identifier, title, status, url).
 * @returns {object} Act entry with id, actCode, title, titleEn, abbreviation,
 *   status, issuedDate, inForceDate and url.
 */
function censusToActEntry(law) {
  const { id, identifier, title, status, url } = law;
  const entry = {
    id,
    actCode: `Act/${identifier}`,
    title,
    titleEn: title,
    abbreviation: identifier,
    status,
    issuedDate: '',
    inForceDate: '',
    url,
  };
  return entry;
}
/**
 * Read and parse the census JSON stored at CENSUS_PATH.
 *
 * @returns {object} The parsed census document.
 * @throws {Error} When the census file does not exist; the message includes
 *   the command that generates it. A malformed file surfaces as the
 *   SyntaxError raised by JSON.parse.
 */
function loadCensus() {
  const censusExists = fs.existsSync(CENSUS_PATH);
  if (censusExists) {
    const rawCensus = fs.readFileSync(CENSUS_PATH, 'utf-8');
    return JSON.parse(rawCensus);
  }
  throw new Error(`Census file not found: ${CENSUS_PATH}\nRun: npx tsx scripts/census.ts`);
}
/**
 * Record ingestion outcomes on the census and write it back to CENSUS_PATH.
 *
 * Each law with a non-failed entry in `results` is marked as ingested with its
 * provision count and today's date. Laws without a result, or with a failed
 * one, are left untouched. The summary counters are then recomputed and the
 * whole census is serialised with 2-space indentation.
 *
 * @param {object} census - Census document; mutated in place (laws and summary).
 * @param {Map} results - Act id -> { provisions, failed } for this run.
 */
function updateCensus(census, results) {
  // Date portion (YYYY-MM-DD) of the current UTC ISO timestamp.
  const [today] = new Date().toISOString().split('T');
  for (const law of census.laws) {
    const outcome = results.get(law.id);
    if (!outcome || outcome.failed) {
      continue;
    }
    Object.assign(law, {
      ingested: true,
      provision_count: outcome.provisions,
      ingestion_date: today,
    });
  }
  // Recompute the summary counters from the law list.
  const { laws, summary } = census;
  summary.total_laws = laws.length;
  summary.ingestable = laws.reduce(
    (count, law) => (law.classification === 'ingestable' ? count + 1 : count),
    0,
  );
  const serialized = JSON.stringify(census, null, 2);
  fs.writeFileSync(CENSUS_PATH, serialized);
}
/**
 * Pull the numeric index out of a cached chunk file name (`body-12.html` -> 12).
 * Names without a numeric suffix yield Infinity so they sort after indexed ones.
 */
function chunkIndex(fileName) {
  const match = /-(\d+)\.[^.]*$/.exec(fileName);
  return match ? Number(match[1]) : Number.POSITIVE_INFINITY;
}

/**
 * Comparator that orders chunk file names by numeric index, then by name.
 * A plain lexicographic sort would place `body-10.html` before `body-2.html`.
 */
function compareChunkFiles(a, b) {
  const indexA = chunkIndex(a);
  const indexB = chunkIndex(b);
  if (indexA !== indexB) {
    return indexA < indexB ? -1 : 1;
  }
  if (a === b) {
    return 0;
  }
  return a < b ? -1 : 1;
}

/**
 * Read the provision/definition counts from an existing seed file.
 *
 * @param {string} seedFile - Path of the seed JSON file.
 * @returns {{provisions: number, definitions: number}|null} The counts, or
 *   null when the file cannot be read, is not valid JSON, or lacks the two arrays.
 */
function readSeedCounts(seedFile) {
  try {
    const seed = JSON.parse(fs.readFileSync(seedFile, 'utf-8'));
    if (!Array.isArray(seed.provisions) || !Array.isArray(seed.definitions)) {
      return null;
    }
    return { provisions: seed.provisions.length, definitions: seed.definitions.length };
  }
  catch {
    // Unreadable or corrupt seed: the caller re-ingests the act.
    return null;
  }
}

/**
 * Rebuild a fetch result from the cached initial HTML and chunk files.
 * Chunks are returned in the numeric order in which they were written.
 *
 * @param {string} sourceFile - Cached initial HTML page.
 * @param {string} chunksDir - Directory of cached `body-N.html` / `tail-N.html` files (may not exist).
 * @returns {{initialHtml: string, bodyChunksHtml: string[], tailChunksHtml: string[], chunksLoaded: number}}
 */
function loadCachedFetchResult(sourceFile, chunksDir) {
  const initialHtml = fs.readFileSync(sourceFile, 'utf-8');
  const chunkNames = fs.existsSync(chunksDir) ? fs.readdirSync(chunksDir) : [];
  const readChunks = (prefix) => chunkNames
    .filter((name) => name.startsWith(prefix))
    .sort(compareChunkFiles)
    .map((name) => fs.readFileSync(path.join(chunksDir, name), 'utf-8'));
  const bodyChunksHtml = readChunks('body-');
  const tailChunksHtml = readChunks('tail-');
  return {
    initialHtml,
    bodyChunksHtml,
    tailChunksHtml,
    chunksLoaded: bodyChunksHtml.length + tailChunksHtml.length,
  };
}

/**
 * Write a freshly fetched act to the HTML cache, replacing any earlier chunks.
 *
 * @param {object} fetchResult - Result with initialHtml, bodyChunksHtml and tailChunksHtml.
 * @param {string} sourceFile - Destination of the initial HTML page.
 * @param {string} chunksDir - Destination directory for the chunk files.
 */
function cacheFetchResult(fetchResult, sourceFile, chunksDir) {
  fs.writeFileSync(sourceFile, fetchResult.initialHtml);
  fs.mkdirSync(chunksDir, { recursive: true });
  // Drop chunks from a previous fetch so an act that now has fewer parts does
  // not leave stale files behind for later cache-based parsing.
  for (const stale of fs.readdirSync(chunksDir)) {
    if (stale.startsWith('body-') || stale.startsWith('tail-')) {
      fs.unlinkSync(path.join(chunksDir, stale));
    }
  }
  fetchResult.bodyChunksHtml.forEach((chunk, i) => {
    fs.writeFileSync(path.join(chunksDir, `body-${i}.html`), chunk);
  });
  fetchResult.tailChunksHtml.forEach((chunk, i) => {
    fs.writeFileSync(path.join(chunksDir, `tail-${i}.html`), chunk);
  });
}

/**
 * Fetch (or load from cache), parse and seed every act in `acts`, then print
 * an ingestion report.
 *
 * Per act:
 *  - with `resume` or `skipFetch`, a readable seed file is reused as-is;
 *    an unreadable one is reported and the act is re-ingested;
 *  - with `skipFetch` and cached HTML, the act is parsed from the cache;
 *  - otherwise the act is fetched, cached under SOURCE_DIR and parsed.
 * The parsed result is written to SEED_DIR as `<id>.json`. A failure on one
 * act is logged and recorded; the run stops early after 10 consecutive failures.
 *
 * @param {Array<object>} acts - Act entries (reads id, abbreviation, url).
 * @param {boolean} skipFetch - Prefer cached seed/HTML files over fetching.
 * @param {boolean} resume - Quietly skip acts that already have a seed file.
 * @returns {Promise<Map>} Act id -> { provisions, failed }.
 */
async function fetchAndParseActs(acts, skipFetch, resume) {
  console.log(`\nProcessing ${acts.length} Singapore Acts...\n`);
  fs.mkdirSync(SOURCE_DIR, { recursive: true });
  fs.mkdirSync(SEED_DIR, { recursive: true });
  let processed = 0;
  let skipped = 0;
  let failed = 0;
  let totalProvisions = 0;
  let totalDefinitions = 0;
  let consecutiveFailures = 0;
  const MAX_CONSECUTIVE_FAILURES = 10;
  const results = new Map();
  const actResults = [];
  for (const act of acts) {
    const sourceFile = path.join(SOURCE_DIR, `${act.id}.html`);
    const chunksDir = path.join(SOURCE_DIR, `${act.id}-chunks`);
    const seedFile = path.join(SEED_DIR, `${act.id}.json`);
    // Resume / skip-fetch: reuse an existing seed when it is readable.
    if ((resume || skipFetch) && fs.existsSync(seedFile)) {
      const cached = readSeedCounts(seedFile);
      if (cached) {
        totalProvisions += cached.provisions;
        totalDefinitions += cached.definitions;
        results.set(act.id, { provisions: cached.provisions, failed: false });
        skipped++;
        processed++;
        if (resume) {
          // Only log every 50th skip to reduce noise
          if (skipped % 50 === 0) {
            console.log(` ... skipped ${skipped} existing acts so far`);
          }
        }
        else {
          console.log(` SKIP ${act.abbreviation} (cached: ${cached.provisions} provisions)`);
          actResults.push({
            id: act.id,
            abbr: act.abbreviation,
            provisions: cached.provisions,
            definitions: cached.definitions,
            status: 'cached',
          });
        }
        continue;
      }
      // Counters were not touched, so re-ingesting cannot double count this act.
      console.log(` WARN ${act.abbreviation}: seed file is unreadable, re-ingesting`);
    }
    try {
      let parsed;
      if (skipFetch && fs.existsSync(sourceFile)) {
        // Reuse cached HTML + chunks
        const fetchResult = loadCachedFetchResult(sourceFile, chunksDir);
        parsed = parseSsoAct(fetchResult, act);
        console.log(` ${act.abbreviation}: parsed from cache (${fetchResult.chunksLoaded} chunks)`);
      }
      else {
        // Full fetch: initial page + all lazy-loaded chunks
        const progress = `[${processed + 1}/${acts.length}]`;
        process.stdout.write(` ${progress} Fetching ${act.abbreviation} (${act.id})...`);
        const fetchResult = await fetchFullAct(act.url);
        cacheFetchResult(fetchResult, sourceFile, chunksDir);
        const totalKb = (fetchResult.initialHtml.length +
          fetchResult.bodyChunksHtml.reduce((s, c) => s + c.length, 0) +
          fetchResult.tailChunksHtml.reduce((s, c) => s + c.length, 0)) / 1024;
        console.log(` OK (${totalKb.toFixed(0)} KB, ${fetchResult.chunksLoaded} chunks)`);
        parsed = parseSsoAct(fetchResult, act);
      }
      fs.writeFileSync(seedFile, JSON.stringify(parsed, null, 2));
      totalProvisions += parsed.provisions.length;
      totalDefinitions += parsed.definitions.length;
      consecutiveFailures = 0;
      console.log(` -> ${parsed.provisions.length} provisions, ${parsed.definitions.length} definitions`);
      results.set(act.id, { provisions: parsed.provisions.length, failed: false });
      actResults.push({
        id: act.id,
        abbr: act.abbreviation,
        provisions: parsed.provisions.length,
        definitions: parsed.definitions.length,
        status: 'ok',
      });
    }
    catch (error) {
      const msg = error instanceof Error ? error.message : String(error);
      console.log(` ERROR ${act.abbreviation}: ${msg}`);
      results.set(act.id, { provisions: 0, failed: true });
      actResults.push({
        id: act.id,
        abbr: act.abbreviation,
        provisions: 0,
        definitions: 0,
        status: `FAILED: ${msg.substring(0, 80)}`,
      });
      failed++;
      consecutiveFailures++;
      // A long unbroken run of failures is treated as fatal rather than continuing.
      if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
        console.log(`\n FATAL: ${MAX_CONSECUTIVE_FAILURES} consecutive failures. Stopping.`);
        console.log(` Re-run with --resume to continue from where you left off.`);
        break;
      }
    }
    processed++;
  }
  // Print summary
  if (skipped > 0) {
    console.log(`\n Skipped ${skipped} existing acts (resume mode)`);
  }
  console.log(`\n${'='.repeat(60)}`);
  console.log('INGESTION REPORT');
  console.log('='.repeat(60));
  console.log(`\n Acts processed: ${processed}`);
  console.log(` Acts skipped (existing): ${skipped}`);
  console.log(` Acts failed: ${failed}`);
  console.log(` Total provisions: ${totalProvisions}`);
  console.log(` Total definitions: ${totalDefinitions}`);
  // Only print per-act breakdown for newly ingested acts (not cached ones),
  // and only when the table stays reasonably short.
  const newResults = actResults.filter((r) => r.status !== 'cached');
  if (newResults.length > 0 && newResults.length <= 100) {
    console.log(`\n Per-Act breakdown (newly ingested):`);
    console.log(` ${'Act'.padEnd(20)} ${'Provisions'.padStart(12)} ${'Definitions'.padStart(13)} Status`);
    console.log(` ${'-'.repeat(20)} ${'-'.repeat(12)} ${'-'.repeat(13)} ${'-'.repeat(30)}`);
    for (const r of newResults) {
      console.log(` ${r.abbr.substring(0, 20).padEnd(20)} ${String(r.provisions).padStart(12)} ${String(r.definitions).padStart(13)} ${r.status}`);
    }
  }
  console.log();
  return results;
}
/**
 * Command-line entry point for a census-driven ingestion run.
 *
 * Prints a banner and the active flags, loads the census, narrows it to the
 * ingestable laws (optionally starting at --start-from and capped by --limit),
 * runs the fetch/parse stage, writes the outcomes back to the census and
 * prints the final ingestion status. Exits the process with code 1 when the
 * --start-from ID is not among the ingestable laws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { limit, skipFetch, resume, startFrom } = parseArgs();

  // Banner followed by one line per active flag.
  const headerLines = [
    'Singapore Law MCP -- Census-Driven Ingestion Pipeline',
    '=====================================================\n',
    ` Source: Singapore Statutes Online (sso.agc.gov.sg)`,
    ` License: Singapore Open Data Licence`,
    ` Rate limit: 500ms between requests`,
    ` Census: ${CENSUS_PATH}`,
  ];
  if (limit) {
    headerLines.push(` --limit ${limit}`);
  }
  if (skipFetch) {
    headerLines.push(` --skip-fetch`);
  }
  if (resume) {
    headerLines.push(` --resume`);
  }
  if (startFrom) {
    headerLines.push(` --start-from ${startFrom}`);
  }
  for (const line of headerLines) {
    console.log(line);
  }

  const census = loadCensus();
  const { summary } = census;
  console.log(`\n Census: ${summary.total_laws} total, ${summary.ingestable} ingestable`);

  // Work queue: ingestable laws only, in census order.
  let queue = census.laws.filter((law) => law.classification === 'ingestable');

  if (startFrom) {
    const startIndex = queue.findIndex(({ id }) => id === startFrom);
    if (startIndex === -1) {
      console.error(` ERROR: Act ID "${startFrom}" not found in census`);
      process.exit(1);
    }
    queue = queue.slice(startIndex);
    console.log(` Starting from ${startFrom} (${queue.length} remaining)`);
  }

  if (limit) {
    queue = queue.slice(0, limit);
  }

  // Convert to the act-entry format and run the pipeline.
  const acts = queue.map((law) => censusToActEntry(law));
  const results = await fetchAndParseActs(acts, skipFetch, resume);

  updateCensus(census, results);
  console.log(` Census updated: ${CENSUS_PATH}`);

  // Count every law flagged as ingested, including those from earlier runs.
  const ingestedCount = census.laws.reduce((count, law) => (law.ingested ? count + 1 : count), 0);
  console.log(` Ingestion status: ${ingestedCount}/${census.summary.ingestable} acts ingested`);
}
// Script entry point: run the pipeline; any rejection that escapes main() is
// printed and turned into a non-zero exit code.
const reportFatalError = (fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
};
main().catch(reportFatalError);
//# sourceMappingURL=ingest.js.map