/*
 * @earvinpiamonte/pagasa-tcb-parser
 * Version: (not recorded)
 * A TypeScript library for parsing PAGASA weather bulletin PDF files.
 * 210 lines • 8.51 kB
 * JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseArea = exports.containsAreaNames = exports.mergeAreas = exports.parseAreasText = exports.extractTcwsAreaText = exports.extractRegionsFromBlock = void 0;
const patterns_1 = require("../constants/patterns");
const text_utils_1 = require("../utils/text-utils");
/**
 * Produces the region listing for one bulletin block.
 *
 * The block's area text is extracted, parsed into area entries and merged by
 * name. Every merged entry is placed under `luzon`; `visayas` and `mindanao`
 * are always returned as empty arrays by this function.
 *
 * @param {string} block - Raw text of a single bulletin block.
 * @returns {{luzon: Array, visayas: Array, mindanao: Array}} Region listing.
 */
const extractRegionsFromBlock = (block) => {
    const areaText = (0, exports.extractTcwsAreaText)(block);
    // An empty extraction means there is nothing to parse.
    let areas = [];
    if (areaText) {
        areas = (0, exports.parseAreasText)(areaText);
    }
    const luzon = (0, exports.mergeAreas)(areas);
    return { luzon, visayas: [], mindanao: [] };
};
exports.extractRegionsFromBlock = extractRegionsFromBlock;
/**
 * Scans a bulletin block line by line and returns the area text that follows
 * a signal header line.
 *
 * The scan is a small state machine driven by two flags:
 *  - `signalFound`: a line matching `PATTERNS.signalNumber` or
 *    `PATTERNS.tcwsNumber` has been seen.
 *  - `collecting`: the first area line has been found and following lines
 *    are being appended to it.
 *
 * NOTE(review): the PATTERNS regexes are defined outside this file. If any of
 * them carries the `g` or `y` flag, the repeated `.test()` calls below are
 * stateful through `lastIndex` — confirm against ../constants/patterns.
 *
 * @param {string} block - Raw text of a single bulletin block.
 * @returns {string} The collected area text with `PATTERNS.cleanExtra`
 *   matches removed and `PATTERNS.normalizeSpace` matches replaced by a single
 *   space; an empty string when no area line was collected.
 */
const extractTcwsAreaText = (block) => {
// Work on trimmed, non-empty lines only.
const lines = block
.split("\n")
.map((line) => line.trim())
.filter((line) => line.length > 0);
let areaText = "";
let signalFound = false;
let collecting = false;
for (const line of lines) {
// A signal header line arms the scan and stops any collection in progress.
if (patterns_1.PATTERNS.signalNumber.test(line)) {
signalFound = true;
collecting = false;
continue;
}
// Same handling for the second header pattern (tcwsNumber).
if (patterns_1.PATTERNS.tcwsNumber.test(line)) {
signalFound = true;
collecting = false;
continue;
}
// A "warning lead time" line ends the scan once area text is being
// collected; before that it is simply skipped.
if (/warning lead time/i.test(line)) {
if (collecting) {
break;
}
continue;
}
// Lines matching the skip or region-heading patterns never contribute text.
if (patterns_1.PATTERNS.skipLine.test(line) || patterns_1.PATTERNS.regionHeading.test(line)) {
continue;
}
if (signalFound && !collecting) {
// Looking for the first area line after a header: strip whatever the
// trailing-dash patterns match before testing the candidate.
const cleanedCandidate = line
.replace(patterns_1.PATTERNS.trailingMultipleDash, "")
.replace(patterns_1.PATTERNS.trailingDash, "");
const lower = cleanedCandidate.toLowerCase();
if (patterns_1.PATTERNS.areaFiller.test(lower)) {
continue;
}
if ((0, exports.containsAreaNames)(cleanedCandidate)) {
collecting = true;
// NOTE: plain assignment — if a later header reset `collecting`, text
// collected before that header is replaced here, not appended to.
areaText = cleanedCandidate;
}
}
else if (collecting) {
// Continuation lines: ignore dash-only and filler lines.
if (patterns_1.PATTERNS.dashOnly.test(line) ||
line === "-" ||
patterns_1.PATTERNS.areaFiller.test(line.toLowerCase())) {
continue;
}
const cleaned = line.replace(patterns_1.PATTERNS.trailingDash, "");
if (!cleaned) {
continue;
}
// Stop if the line matches the signal pattern once its trailing dash is
// removed (the untrimmed line did not match at the top of the loop).
if (patterns_1.PATTERNS.signalNumber.test(cleaned)) {
break;
}
areaText += " " + cleaned;
}
}
// Final tidy-up of the accumulated text.
return areaText
.trim()
.replace(patterns_1.PATTERNS.cleanExtra, "")
.replace(patterns_1.PATTERNS.normalizeSpace, " ");
};
exports.extractTcwsAreaText = extractTcwsAreaText;
/**
 * Splits a block of area text into segments and parses each one.
 *
 * Before splitting, `PATTERNS.normalizeSpace` matches are replaced by a single
 * space and an "and" that directly follows a comma or semicolon is dropped
 * (", and X" becomes ", X"). Segments for which `parseArea` returns `null`
 * are omitted from the result.
 *
 * @param {string} text - Area text as returned by `extractTcwsAreaText`.
 * @returns {Array<object>} Parsed area entries, in segment order.
 */
const parseAreasText = (text) => {
    const normalized = text.replace(patterns_1.PATTERNS.normalizeSpace, " ");
    const withoutListAnd = normalized.replace(/([,;])\s+and\s+/g, "$1 ");
    const pieces = (0, text_utils_1.splitPreservingParentheses)(withoutListAnd.trim());
    const details = [];
    for (const piece of pieces) {
        const detail = (0, exports.parseArea)(piece.trim());
        if (detail !== null) {
            details.push(detail);
        }
    }
    return details;
};
exports.parseAreasText = parseAreasText;
/**
 * Collapses area entries that share a name (compared case-insensitively)
 * into a single entry.
 *
 * The first occurrence of a name supplies the `name` spelling and the output
 * position. The `parts`, `locals` and `islands` lists of later occurrences are
 * appended to it without duplicates. Input entries and their arrays are never
 * mutated: lists are copied before being stored.
 *
 * @param {Array<{name: string, parts?: string[], locals?: string[], islands?: string[]}>} areas
 *   Area entries, possibly containing repeated names.
 * @returns {Array<{name: string, parts: (string[]|undefined), locals: (string[]|undefined), islands: (string[]|undefined)}>}
 *   One entry per distinct name, in first-seen order.
 */
const mergeAreas = (areas) => {
    const listFields = ["parts", "locals", "islands"];
    const byName = new Map();
    // Appends the values of `extra` that `current` does not hold yet.
    // `current` is always one of our own copies, so pushing onto it is safe.
    const combine = (current, extra) => {
        if (!extra || extra.length === 0) {
            return current;
        }
        if (!current) {
            return [...extra];
        }
        for (const value of extra) {
            if (!current.includes(value)) {
                current.push(value);
            }
        }
        return current;
    };
    for (const area of areas) {
        const key = area.name.toLowerCase();
        const seen = byName.get(key);
        if (seen === undefined) {
            // First sighting: store defensive copies of the lists.
            byName.set(key, {
                name: area.name,
                parts: area.parts ? [...area.parts] : undefined,
                locals: area.locals ? [...area.locals] : undefined,
                islands: area.islands ? [...area.islands] : undefined,
            });
            continue;
        }
        for (const field of listFields) {
            seen[field] = combine(seen[field], area[field]);
        }
    }
    return [...byName.values()];
};
exports.mergeAreas = mergeAreas;
/**
 * Heuristically decides whether a line looks like a list of area names.
 *
 * A line is rejected when it matches `PATTERNS.skipMetadata` or
 * `PATTERNS.skipLine`, or when (after trailing placeholder dashes are
 * removed) its lower-cased form matches `PATTERNS.areaFiller`. Otherwise it is
 * accepted when either:
 *  - it has a list delimiter (",", ";" or the word "and") or matches
 *    `PATTERNS.areaLineKeywords`, AND it contains a capitalised word or an
 *    all-caps word of three or more letters; or
 *  - it is a single capitalised word, optionally followed by one
 *    parenthesised word (e.g. "Batanes (Itbayat)").
 *
 * @param {string} line - Candidate line.
 * @returns {boolean} `true` when the line looks like it names areas.
 */
const containsAreaNames = (line) => {
    const patterns = patterns_1.PATTERNS;
    const isSkippable = patterns.skipMetadata.test(line) || patterns.skipLine.test(line);
    if (isSkippable) {
        return false;
    }
    // Drop trailing placeholder dash columns ("Batanes - -" -> "Batanes"),
    // which show up in tabular listings.
    const stripped = line.replace(/\s+(?:-\s*){1,3}$/g, "").trim();
    if (patterns.areaFiller.test(stripped.toLowerCase())) {
        return false;
    }
    // Signals that the line is a list or uses area wording such as
    // "northern portion of Cagayan, Ilocos Norte and Abra".
    const hasSeparators = /,|;|\band\b/i.test(stripped);
    const hasKeywords = patterns.areaLineKeywords.test(stripped);
    // Title-case words, or an all-caps word of at least three letters.
    const hasTitleCaseWords = /\b[A-Z][a-z'’\-]+(?:\s+[A-Z][a-z'’\-]+)*\b/.test(stripped);
    const hasProperNouns = hasTitleCaseWords || /\b[A-Z]{3,}\b/.test(stripped);
    // A lone proper noun with an optional parenthetical qualifier.
    const isSingleName = /^[A-Z][A-Za-z'’\-]+(?:\s+\([A-Za-z'’\-]+\))?$/.test(stripped);
    if ((hasSeparators || hasKeywords) && hasProperNouns) {
        return true;
    }
    return isSingleName;
};
exports.containsAreaNames = containsAreaNames;
/**
 * Parses one area segment into a structured entry.
 *
 * Steps, in order:
 *  1. Remove `PATTERNS.cleanExtra` matches; give up (`null`) if fewer than
 *     three characters remain.
 *  2. If `PATTERNS.portionPattern` matches, record capture group 2
 *     (lower-cased) as a part and continue with capture group 3.
 *  3. Record "rest" when `PATTERNS.restPattern` matches and strip the match.
 *  4. Record "mainland" when the text contains that word.
 *  5. Record every `PATTERNS.additionalPortion` match (lower-cased, deduped),
 *     then strip `PATTERNS.cleanPortion` matches.
 *  6. Split an "including ..." tail into the `islands` list.
 *  7. Hand the remainder to `extractMunicipalities` for the name and locals.
 *
 * @param {string} areaText - One segment of an area listing.
 * @returns {({name: string, islands?: string[], parts?: string[], locals?: string[]}|null)}
 *   The parsed entry (optional keys present only when non-empty), or `null`
 *   when the cleaned segment is shorter than three characters.
 */
const parseArea = (areaText) => {
    const patterns = patterns_1.PATTERNS;
    const trimmed = areaText.trim().replace(patterns.cleanExtra, "");
    if (!trimmed || trimmed.length < 3) {
        return null;
    }
    const parts = [];
    let remaining = trimmed;
    // Leading portion phrase: group 2 is kept as the descriptor, group 3 is
    // what is parsed from here on.
    const leadingPortion = remaining.match(patterns.portionPattern);
    if (leadingPortion) {
        parts.push(leadingPortion[2].toLowerCase());
        remaining = leadingPortion[3];
    }
    // "rest of ..." wording.
    if (patterns.restPattern.test(remaining)) {
        parts.push("rest");
        remaining = remaining.replace(patterns.restPattern, "");
    }
    // NOTE(review): only "mainland" followed by whitespace is removed here; a
    // trailing "mainland" is recorded as a part but left in the text.
    if (remaining.toLowerCase().includes("mainland")) {
        parts.push("mainland");
        remaining = remaining.replace(/mainland\s+/i, "");
    }
    // Any further portion descriptors, without repeating ones already seen.
    const extraPortions = remaining.match(patterns.additionalPortion);
    if (extraPortions) {
        for (const raw of extraPortions) {
            const portion = raw.toLowerCase();
            if (!parts.includes(portion)) {
                parts.push(portion);
            }
        }
    }
    // Remove whatever portion wording is left in the text.
    remaining = remaining.replace(patterns.cleanPortion, "");
    // "including" lists, e.g. "Cagayan including Babuyan Islands". Common
    // punctuation right after the keyword ("including:", "including -") is
    // tolerated.
    let islands = [];
    const including = remaining.match(/\bincluding\b[\s:;,\-\u2013\u2014]*(.+)$/i);
    if (including) {
        const commaSeparated = including[1].replace(/\band\b/gi, ",");
        islands = commaSeparated
            .split(/\s*,\s*|\s+and\s+/i)
            .map((entry) => entry.trim())
            .filter((entry) => entry.length > 0);
        remaining = remaining.replace(including[0], "").trim();
    }
    const { name: areaName, municipalities: locals } = (0, text_utils_1.extractMunicipalities)(remaining);
    // Keys are added in a fixed order: name, islands, parts, locals.
    const result = { name: areaName.trim() };
    if (islands.length > 0) {
        result.islands = islands;
    }
    if (parts.length > 0) {
        result.parts = parts;
    }
    if (locals.length > 0) {
        result.locals = locals;
    }
    return result;
};
exports.parseArea = parseArea;
//# sourceMappingURL=area-parser.js.map