// mwoffliner
// Version:
// MediaWiki ZIM scraper
// 185 lines • 9.93 kB
// JavaScript
import Downloader from '../Downloader.js';
import RedisStore from '../RedisStore.js';
import * as logger from '../Logger.js';
import { getArticlesByIds } from './mw-api.js';
import { deDup } from './misc.js';
/**
 * Attaches the articles found in `articleStore` to the categories they reference,
 * then repeats the process for the parent categories of those categories.
 *
 * For each batch handed out by `articleStore.iterateItems`:
 *  - the articles are grouped by the title of every category they list;
 *  - categories without an entry in `RedisStore.articleDetailXId` are requested
 *    through `getArticlesByIds`;
 *  - every category that then has an entry gets the grouped articles appended to
 *    its `pages` list, and its own `categories` are queued in a temporary store.
 *
 * Once the store has been walked, the temporary store becomes the next input (and is
 * flushed after use). The function stops when a pass queues nothing.
 *
 * @param articleStore - key/value store of article details to start from.
 * @param deleteArticleStore - when truthy, `articleStore` is flushed after being processed.
 * @returns a promise that always resolves to `null`.
 */
export async function getCategoriesForArticles(articleStore, deleteArticleStore = false) {
    let currentStore = articleStore;
    let flushWhenDone = deleteArticleStore;
    while (true) {
        const { articleDetailXId } = RedisStore;
        // Parent categories discovered during this pass are queued here for the next pass
        const parentQueue = RedisStore.createRedisKvs(`${Date.now()}-request`);
        const articleCount = await currentStore.len();
        logger.log(`Fetching categories for [${articleCount}] articles`);
        await currentStore.iterateItems(Downloader.speed, async (articleKeyValuePairs, runningWorkers) => {
            const batchIds = Object.keys(articleKeyValuePairs);
            logger.log(`Worker getting categories for articles [${logger.logifyArray(batchIds)}] - ${runningWorkers} worker(s) running`);
            // Group the articles of this batch by the title of each category they list
            const membersByCategory = {};
            for (const detail of Object.values(articleKeyValuePairs)) {
                for (const cat of detail.categories || []) {
                    const member = { title: detail.title, ns: detail.ns };
                    membersByCategory[cat.title] = (membersByCategory[cat.title] || []).concat(member);
                }
            }
            const categoryIds = Object.keys(membersByCategory);
            if (!categoryIds.length) {
                return;
            }
            // Request the categories that have no stored detail yet
            const storedDetails = await articleDetailXId.getMany(categoryIds);
            const unknownIds = [];
            for (const [id, detail] of Object.entries(storedDetails)) {
                if (!detail) {
                    unknownIds.push(id);
                }
            }
            if (unknownIds.length) {
                await getArticlesByIds(unknownIds, false);
            }
            // Re-read so that the details stored by the request above are included
            const categoryDetails = await articleDetailXId.getMany(categoryIds);
            for (const [id, detail] of Object.entries(categoryDetails)) {
                if (detail) {
                    const parents = {};
                    for (const info of detail.categories || []) {
                        parents[info.title] = info;
                    }
                    await parentQueue.setMany(parents);
                    detail.pages = (detail.pages || []).concat(membersByCategory[id]);
                    await articleDetailXId.set(id, detail);
                }
            }
        });
        if (flushWhenDone) {
            await currentStore.flush();
        }
        const queuedParents = await parentQueue.len();
        if (!queuedParents) {
            return null;
        }
        // Continue with the queued parents; that temporary store is always discarded after use
        currentStore = parentQueue;
        flushWhenDone = true;
    }
}
// Fields of an article detail that hold references (by `title`) to other stored articles.
const TRIMMED_REFERENCE_FIELDS = ['categories', 'subCategories', 'pages'];

/**
 * Indexes a list of `{ title, ... }` entries by title (later duplicates win).
 * A missing list yields an empty index.
 */
function indexByTitle(entries) {
    return (entries || []).reduce((acc, entry) => {
        acc[entry.title] = entry;
        return acc;
    }, {});
}

/**
 * Removes dangling references from every entry of `RedisStore.articleDetailXId`.
 *
 * - Entries whose `missing` property is a string are deleted from the store.
 * - For all other entries, the `categories`, `subCategories` and `pages` lists are
 *   reduced to the items whose title is reported as existing by `existsMany`;
 *   the entry is written back only when at least one list shrank.
 *
 * @returns the number of entries that were deleted or rewritten.
 */
export async function trimUnmirroredPages() {
    const { articleDetailXId } = RedisStore;
    // Read the size once: it is used both for the banner and for the progress percentage
    const numKeys = await articleDetailXId.len();
    logger.log(`Trimming un-mirrored articles for [${numKeys}] articles`);
    let prevPercentProgress = -1;
    let processedArticles = 0;
    let modifiedArticles = 0;
    await articleDetailXId.iterateItems(Downloader.speed, async (articleKeyValuePairs) => {
        for (const [articleId, articleDetail] of Object.entries(articleKeyValuePairs)) {
            processedArticles += 1;
            if (typeof articleDetail.missing === 'string') {
                await articleDetailXId.delete(articleId);
                modifiedArticles += 1;
                // TODO: remove references to current article on delete
                continue;
            }
            const indexes = TRIMMED_REFERENCE_FIELDS.map((field) => indexByTitle(articleDetail[field]));
            // The three existence checks are independent, so they are issued together
            const existence = await Promise.all(indexes.map((byTitle) => {
                const ids = Object.keys(byTitle);
                return ids.length ? articleDetailXId.existsMany(ids, true) : Promise.resolve({});
            }));
            let hasUpdated = false;
            TRIMMED_REFERENCE_FIELDS.forEach((field, position) => {
                const byTitle = indexes[position];
                const exists = existence[position];
                const keptKeys = deDup(Object.keys(exists).filter((key) => exists[key]), (key) => key);
                // A shorter list means at least one referenced article is not in the store
                if (keptKeys.length !== Object.keys(byTitle).length) {
                    articleDetail[field] = keptKeys.map((key) => byTitle[key]);
                    hasUpdated = true;
                }
            });
            if (hasUpdated) {
                await articleDetailXId.set(articleId, articleDetail);
                modifiedArticles += 1;
            }
            if (processedArticles % 100 === 0) {
                const percentProgress = Math.floor((processedArticles / numKeys) * 1000) / 10;
                if (percentProgress !== prevPercentProgress) {
                    prevPercentProgress = percentProgress;
                    logger.log(`Progress trimming un-mirrored articles [${processedArticles}/${numKeys}] [${percentProgress}%]`);
                }
            }
        }
    });
    return modifiedArticles;
}
/**
 * Removes sparsely populated category nodes from the graph stored in
 * `RedisStore.articleDetailXId`.
 *
 * A category (an entry with `ns === 14`) that has at most 3 children (pages plus
 * sub-categories) is deleted. Before deletion its children are re-attached so that
 * they stay reachable:
 *  - each stored child page drops the removed category from its `categories` and
 *    inherits the removed category's own parent categories;
 *  - each stored parent category inherits the removed category's pages and
 *    sub-categories, and drops the removed category from its `subCategories`.
 *
 * @returns an object `{ deletedNodes }` with the number of categories removed.
 */
export async function simplifyGraph() {
    logger.log('Simplifying graph (removing empty categories)');
    const { articleDetailXId } = RedisStore;
    const numKeys = await articleDetailXId.len();
    let prevPercentProgress = -1;
    let processedArticles = 0;
    let deletedNodes = 0;
    await articleDetailXId.iterateItems(Downloader.speed, async (articleKeyValuePairs) => {
        for (const [articleId, articleDetail] of Object.entries(articleKeyValuePairs)) {
            processedArticles += 1;
            if (articleDetail.ns !== 14) {
                continue; // Only trim category articles
            }
            // Normalise once: any of these lists may be absent on a stored detail
            const childPages = articleDetail.pages || [];
            const childSubCategories = articleDetail.subCategories || [];
            const parentCategories = articleDetail.categories || [];
            if (childPages.length + childSubCategories.length > 3) {
                continue; // Populated enough to be kept
            }
            // Update sub pages
            // Add parent categories to child pages
            if (childPages.length) {
                const scrapedPages = await articleDetailXId.getMany(childPages.map((p) => p.title));
                for (const [pageId, pageDetail] of Object.entries(scrapedPages)) {
                    if (pageDetail) {
                        pageDetail.categories = (pageDetail.categories || [])
                            .filter((c) => c && c.title !== articleDetail.title) // remove self
                            .concat(parentCategories); // add parent categories
                        pageDetail.categories = deDup(pageDetail.categories, (o) => o.title);
                        await articleDetailXId.set(pageId, pageDetail);
                    }
                }
            }
            // Update parent categories
            // Add children to parent categories
            if (parentCategories.length) {
                const scrapedCategories = await articleDetailXId.getMany(parentCategories.map((p) => p.title));
                for (const [catId, catDetail] of Object.entries(scrapedCategories)) {
                    if (catDetail) {
                        const categoryDetail = Object.assign({}, catDetail);
                        const mergedPages = (catDetail.pages || [])
                            .concat(childPages)
                            .filter(Boolean);
                        // Keep every sub-category EXCEPT the node being removed
                        const mergedSubCategories = (catDetail.subCategories || [])
                            .concat(childSubCategories)
                            .filter((c) => c && c.title !== articleDetail.title);
                        categoryDetail.pages = deDup(mergedPages, (o) => o.title);
                        categoryDetail.subCategories = deDup(mergedSubCategories, (o) => o.title);
                        await articleDetailXId.set(catId, categoryDetail);
                    }
                }
            }
            await articleDetailXId.delete(articleId);
            deletedNodes += 1;
        }
        if (processedArticles % 10 === 0) {
            const percentProgress = Math.floor((processedArticles / numKeys) * 1000) / 10;
            if (percentProgress !== prevPercentProgress) {
                prevPercentProgress = percentProgress;
                logger.log(`Progress simplifying graph [${processedArticles}/${numKeys}] [${percentProgress}%] deleted [${deletedNodes}]`);
            }
        }
    });
    return { deletedNodes };
}
//# sourceMappingURL=categories.js.map