UNPKG

@earvinpiamonte/pagasa-tcb-parser

Version:

A TypeScript library for parsing PAGASA weather bulletin PDF files

210 lines 8.51 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.parseArea = exports.containsAreaNames = exports.mergeAreas = exports.parseAreasText = exports.extractTcwsAreaText = exports.extractRegionsFromBlock = void 0; const patterns_1 = require("../constants/patterns"); const text_utils_1 = require("../utils/text-utils"); const extractRegionsFromBlock = (block) => { const rawAreaText = (0, exports.extractTcwsAreaText)(block); const parsedAreas = rawAreaText ? (0, exports.parseAreasText)(rawAreaText) : []; return { luzon: (0, exports.mergeAreas)(parsedAreas), visayas: [], mindanao: [], }; }; exports.extractRegionsFromBlock = extractRegionsFromBlock; const extractTcwsAreaText = (block) => { const lines = block .split("\n") .map((line) => line.trim()) .filter((line) => line.length > 0); let areaText = ""; let signalFound = false; let collecting = false; for (const line of lines) { if (patterns_1.PATTERNS.signalNumber.test(line)) { signalFound = true; collecting = false; continue; } if (patterns_1.PATTERNS.tcwsNumber.test(line)) { signalFound = true; collecting = false; continue; } if (/warning lead time/i.test(line)) { if (collecting) { break; } continue; } if (patterns_1.PATTERNS.skipLine.test(line) || patterns_1.PATTERNS.regionHeading.test(line)) { continue; } if (signalFound && !collecting) { const cleanedCandidate = line .replace(patterns_1.PATTERNS.trailingMultipleDash, "") .replace(patterns_1.PATTERNS.trailingDash, ""); const lower = cleanedCandidate.toLowerCase(); if (patterns_1.PATTERNS.areaFiller.test(lower)) { continue; } if ((0, exports.containsAreaNames)(cleanedCandidate)) { collecting = true; areaText = cleanedCandidate; } } else if (collecting) { if (patterns_1.PATTERNS.dashOnly.test(line) || line === "-" || patterns_1.PATTERNS.areaFiller.test(line.toLowerCase())) { continue; } const cleaned = line.replace(patterns_1.PATTERNS.trailingDash, ""); if (!cleaned) { continue; } if (patterns_1.PATTERNS.signalNumber.test(cleaned)) { break; } areaText += " " + cleaned; } } return areaText .trim() .replace(patterns_1.PATTERNS.cleanExtra, "") .replace(patterns_1.PATTERNS.normalizeSpace, " "); }; exports.extractTcwsAreaText = extractTcwsAreaText; const parseAreasText = (text) => { const cleanText = text .replace(patterns_1.PATTERNS.normalizeSpace, " ") .replace(/([,;])\s+and\s+/g, "$1 ") .trim(); const segments = (0, text_utils_1.splitPreservingParentheses)(cleanText); return segments .map((segment) => (0, exports.parseArea)(segment.trim())) .filter((detail) => detail !== null); }; exports.parseAreasText = parseAreasText; const mergeAreas = (areas) => { const merged = new Map(); const mergeUnique = (target, incoming) => { if (!incoming || incoming.length === 0) { return target; } if (!target) { return [...incoming]; } for (const item of incoming) { if (!target.includes(item)) { target.push(item); } } return target; }; for (const area of areas) { const key = area.name.toLowerCase(); if (merged.has(key)) { const existing = merged.get(key); existing.parts = mergeUnique(existing.parts, area.parts); existing.locals = mergeUnique(existing.locals, area.locals); existing.islands = mergeUnique(existing.islands, area.islands); } else { merged.set(key, { name: area.name, parts: area.parts ? [...area.parts] : undefined, locals: area.locals ? [...area.locals] : undefined, islands: area.islands ? [...area.islands] : undefined, }); } } return Array.from(merged.values()); }; exports.mergeAreas = mergeAreas; const containsAreaNames = (line) => { if (patterns_1.PATTERNS.skipMetadata.test(line) || patterns_1.PATTERNS.skipLine.test(line)) { return false; } // Clean trailing placeholder dash columns (e.g. "Batanes - -" -> "Batanes") often present in tabular TCWS listings const cleanedLine = line.replace(/\s+(?:-\s*){1,3}$/g, "").trim(); const lower = cleanedLine.toLowerCase(); if (patterns_1.PATTERNS.areaFiller.test(lower)) { return false; } // Area lines often contain commas/and-separated phrases with optional portion/rest/mainland keywords, e.g. "northern portion of Cagayan, Ilocos Norte and Abra" const hasListDelimiters = /,|;|\band\b/i.test(cleanedLine); const hasAreaKeywords = patterns_1.PATTERNS.areaLineKeywords.test(cleanedLine); // Starts with a known signal header is already filtered. Keep lines with title-case words and avoid all-caps metadata const looksLikeProperNouns = /\b[A-Z][a-z'’\-]+(?:\s+[A-Z][a-z'’\-]+)*\b/.test(cleanedLine) || /\b[A-Z]{3,}\b/.test(cleanedLine); // Additionally accept a single proper noun possibly with parenthetical qualifier (e.g., "Batanes (Itbayat)") const singleIslandPattern = /^[A-Z][A-Za-z'’\-]+(?:\s+\([A-Za-z'’\-]+\))?$/; return (((hasListDelimiters || hasAreaKeywords) && looksLikeProperNouns) || singleIslandPattern.test(cleanedLine)); }; exports.containsAreaNames = containsAreaNames; const parseArea = (areaText) => { const cleanArea = areaText.trim().replace(patterns_1.PATTERNS.cleanExtra, ""); if (!cleanArea || cleanArea.length < 3) { return null; } const partDescriptors = []; let workingArea = cleanArea; // Extract portion patterns const portionMatch = workingArea.match(patterns_1.PATTERNS.portionPattern); if (portionMatch) { partDescriptors.push(portionMatch[2].toLowerCase()); workingArea = portionMatch[3]; } // Handle "rest of" and "mainland" if (patterns_1.PATTERNS.restPattern.test(workingArea)) { partDescriptors.push("rest"); workingArea = workingArea.replace(patterns_1.PATTERNS.restPattern, ""); } if (workingArea.toLowerCase().includes("mainland")) { partDescriptors.push("mainland"); workingArea = workingArea.replace(/mainland\s+/i, ""); } // Extract additional portion descriptors const additionalMatch = workingArea.match(patterns_1.PATTERNS.additionalPortion); if (additionalMatch) { additionalMatch.forEach((portion) => { const p = portion.toLowerCase(); if (!partDescriptors.includes(p)) { partDescriptors.push(p); } }); } // Clean remaining portion text workingArea = workingArea.replace(patterns_1.PATTERNS.cleanPortion, ""); // Handle "including" lists, e.g. "Cagayan including Babuyan Islands" let islands = []; // allow common punctuation after 'including' (e.g. 'including:', 'including -', 'including;') const includingMatch = workingArea.match(/\bincluding\b[\s:;,\-\u2013\u2014]*(.+)$/i); if (includingMatch) { // preserve original 'and' replacement then split on commas or 'and' separators const listText = includingMatch[1] .replace(/\band\b/gi, ",") .split(/\s*,\s*|\s+and\s+/i) .map((s) => s.trim()) .filter((s) => s.length > 0); islands = listText; workingArea = workingArea.replace(includingMatch[0], "").trim(); } const { name, municipalities } = (0, text_utils_1.extractMunicipalities)(workingArea); const result = { name: name.trim() }; if (islands.length > 0) { result.islands = islands; } if (partDescriptors.length > 0) { result.parts = partDescriptors; } if (municipalities.length > 0) { result.locals = municipalities; } return result; }; exports.parseArea = parseArea; //# sourceMappingURL=area-parser.js.map