UNPKG

@padrocha/uam-scraping

Version:

Scraping of teachers

417 lines (356 loc) 20.3 kB
#!/usr/bin/env node
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync, createWriteStream } from 'fs';
import puppeteer from 'puppeteer';
import PDF from 'pdfkit';
import { execSync } from 'child_process';
import { config } from './config';
import { week, log, confirm, progressBar, selectUEAS, tryDOM, timeParse, selectSchedule, selectTeachers, password, askUser } from './utils';

/**
 * CLI entry point.
 *
 * Flow, in order:
 *  1. Optionally reuse the JSON backup at `config.JSONUEA`.
 *  2. Otherwise log into SIIUAM, let the user pick UEAs, read each UEA's
 *     schedule table, then look every teacher up on misprofesores and save
 *     the merged result to `config.JSONUEA`.
 *  3. Build every conflict-free combination containing one group per subject.
 *  4. Rank combinations by mean teacher quality, optionally keep only those
 *     containing the teachers the user prioritises.
 *  5. Render the chosen combination to `config.PDF.SCHEDULE` and open it.
 *
 * The process always terminates through `process.exit`: code 0 on success,
 * code 1 after any thrown error (which is printed with `console.error`).
 */
(async ({ platform, stdout }: NodeJS.Process) => {
  // Exit code is recorded here and applied in `finally`, so that the final
  // `log('Done')` actually runs (calling process.exit inside try/catch would
  // terminate the process before `finally` executes).
  let exit_code = 0;
  try {
    const ueas = new Array<uea>();
    let subjects = new Map<string, ueaSchedule[]>();
    let teachers = new Set<string>();

    // ---- 1. Reuse a previous scrape if the user wants to -------------------
    if (existsSync(config.JSONUEA)) {
      log('Uea´s backup found');
      if (await confirm('Do you wanna use it')) {
        log('Reading Uea´s');
        const { schedules }: { schedules: [string, ueaData[]][] } = JSON.parse(readFileSync(config.JSONUEA, 'utf8'));
        if (schedules) {
          subjects = new Map<string, ueaData[]>(schedules);
          // Only teachers that carry a name (i.e. had ratings stored) are listed.
          teachers = Array.from(subjects as Map<string, ueaData[]>).reduce((prev, [, schedule]) => {
            schedule.forEach(({ teacher }) => {
              return teacher.name.length > 1 ? prev.add(teacher.name) : false;
            });
            return prev;
          }, new Set<string>());
        }
      } else {
        // Erase the confirmation prompt line from the terminal.
        stdout.moveCursor(0, -1);
        stdout.clearScreenDown();
      }
    }

    // ---- 2. Scrape SIIUAM and misprofesores --------------------------------
    if (subjects.size < 1) {
      log('Loading page: SIIUAM');
      const browser = await puppeteer.launch({ headless: config.HEADLESS });
      const SIIUAM = await browser.newPage();
      await SIIUAM.goto(config.PATH.SIIUAM, { waitUntil: 'load', timeout: 0 });
      SIIUAM.setDefaultNavigationTimeout(config.NAVIGATION);

      // The page is a nested frameset: bodyFrame > controlFrame > menuFrame.
      const bodyFrame = await SIIUAM.$('frame#bodyFrame');
      const body_frame = await bodyFrame?.contentFrame();
      const controlFrame = await body_frame?.$('frame#controlFrame');
      const control_frame = await controlFrame?.contentFrame();
      const menu_frame = await tryDOM<puppeteer.Frame>(async () => {
        const menuFrame = await control_frame?.$('frame#menuFrame');
        return await menuFrame?.contentFrame();
      }, SIIUAM);

      // Credentials come from config when set, otherwise from interactive prompts.
      const USER = config.USER ? config.USER : await askUser();
      const PASS = config.PASS ? config.PASS : await password();
      log('SIIUAM > Loggin user');
      await menu_frame.type('input[name="NOMBRE.IDENTIFICACION.NONMODELED"]', USER);
      await menu_frame.type('input[name="COMPLEMENTO.IDENTIFICACION.NONMODELED"]', PASS);
      await menu_frame.click('input[name="GO.IDENTIFICACION.NONMODELED"]');

      log('Fetching UEA Schedules');
      await menu_frame.waitForNavigation();
      await menu_frame.click('a[href="CTWBS012"]');
      const info_frame = await tryDOM(async () => {
        const infoFrame = await body_frame?.$('frame#infoFrame');
        return await infoFrame?.contentFrame();
      }, SIIUAM);

      // Available UEAs live two iframes deep inside the info frame.
      const uea_disponible = await tryDOM(async () => {
        const ifrmBol = await info_frame.$('iframe#ifrm_bol');
        const ifrm_bol = await ifrmBol?.contentFrame();
        const _frame = await ifrm_bol?.$('iframe');
        const frame = await _frame?.contentFrame();
        return await frame?.$$('.celda.uea_disponible');
      }, SIIUAM);
      for await (const uea of uea_disponible) {
        const id = await uea.getProperty('id');
        const key = await id?.jsonValue() as string;
        const _name = await uea.$('.nombre');
        const name = await _name?.evaluate(e => e.textContent) as string;
        ueas.push({ key, name });
      }
      if (ueas.length < 1) throw new Error("Uea´s not found");

      const ueas_selected = await selectUEAS(ueas);
      const horario_UEA = await tryDOM(async () => {
        return await menu_frame.$('a[href="IEWBC005.oConsulta"]');
      }, SIIUAM);
      await horario_UEA.click();
      await info_frame.waitForNavigation();

      let count_ueas = 0;
      progressBar();
      for await (const uea of ueas_selected) {
        // Submit the UEA key (the trailing newline presses Enter).
        await info_frame.type('input[name="CD_UEA.CONTROL.NONMODELED"]', uea.key + '\n');
        await info_frame.waitForNavigation();
        const tr_array = await tryDOM(async () => {
          const fieldset = await info_frame.$('fieldset');
          return await fieldset?.$$("tr");
        }, SIIUAM);
        const schedule = new Array<ueaSchedule>();
        // The first row is dropped before the rows are read as groups.
        tr_array.shift();
        for await (const tr of tr_array) {
          const td_array = await tryDOM(async () => {
            return await tr.$$('td');
          }, SIIUAM);
          // Cell 0 is read as the teacher, cell 1 as the group and cells 4-8
          // as Monday..Friday; cells 2-3 are not used.
          const teacher_handle = await td_array[0].getProperty('innerText');
          const teacher = await teacher_handle?.jsonValue() as string;
          const group_handle = await td_array[1].getProperty('innerText');
          const group = await group_handle?.jsonValue() as string;
          const monday_handle = await td_array[4].getProperty('innerText');
          const monday = await monday_handle?.jsonValue() as string;
          const tuesday_handle = await td_array[5].getProperty('innerText');
          const tuesday = await tuesday_handle?.jsonValue() as string;
          const wednesday_handle = await td_array[6].getProperty('innerText');
          const wednesday = await wednesday_handle?.jsonValue() as string;
          const thursday_handle = await td_array[7].getProperty('innerText');
          const thursday = await thursday_handle?.jsonValue() as string;
          const friday_handle = await td_array[8].getProperty('innerText');
          const friday = await friday_handle?.jsonValue() as string;
          schedule.push({
            key: uea.key,
            teacher,
            group,
            monday: timeParse(monday),
            tuesday: timeParse(tuesday),
            wednesday: timeParse(wednesday),
            thursday: timeParse(thursday),
            friday: timeParse(friday)
          });
        }
        subjects.set(uea.name, schedule);
        progressBar(ueas_selected.length, ++count_ueas);
        // Triple-click selects the current input text so the next key replaces it.
        await info_frame.click('input[name="CD_UEA.CONTROL.NONMODELED"]', { clickCount: 3 });
      }
      if (subjects.size < 1) throw new Error("Subjects not found");

      // Unique teacher names across all scraped schedules (names of length <= 1 are skipped).
      teachers = Array.from(subjects).reduce((prev, [, schedule]) => {
        schedule.forEach(({ teacher }) => {
          return (teacher as string).length > 1 ? prev.add(teacher as string) : false;
        });
        return prev;
      }, new Set<string>());

      const teacherData = new Map<string, teacherData>();
      log('Closing page: SIIUAM');
      await SIIUAM.close();
      log('Loading page: misprofesores');
      const MISPROFESORES = await browser.newPage();
      await MISPROFESORES.goto(config.PATH.MISPROFESORES, { waitUntil: 'load', timeout: 0 });
      MISPROFESORES.setDefaultNavigationTimeout(config.NAVIGATION);
      log('Fetching teachers data');
      progressBar();
      let count_teachers = 0;
      for await (const teacher of teachers) {
        await MISPROFESORES.waitForSelector('input[name="q"]');
        await MISPROFESORES.type('input[name="q"]', teacher + '\n');
        await MISPROFESORES.waitForSelector('a.gs-title');
        const results = await MISPROFESORES.$$('.gsc-webResult.gsc-result');
        // Try each search result in order; the first page that yields ratings wins.
        if (results) for await (const result of results) {
          const a = await result?.$('a.gs-title');
          const teacher_path_handle = await a?.getProperty('href');
          const teacher_path = await teacher_path_handle?.jsonValue() as string;
          const DATA = await browser.newPage();
          await DATA.goto(teacher_path, { waitUntil: 'load', timeout: 0 });
          DATA.setDefaultNavigationTimeout(15_000_000);
          try {
            const quality_element = await DATA.$('.quality .grade');
            const quality_handle = await quality_element?.getProperty('innerText');
            const quality_text = await quality_handle?.jsonValue() as string;
            const quality = Number(quality_text);
            const takeAgain_element = await DATA.$('.takeAgain .grade');
            const takeAgain_handle = await takeAgain_element?.getProperty('innerText');
            const takeAgain_text = await takeAgain_handle?.jsonValue() as string;
            // The last character is sliced off before the numeric conversion.
            const takeAgain = Number(takeAgain_text?.slice(0, -1));
            const difficulty_element = await DATA.$('.difficulty .grade');
            const difficulty_handle = await difficulty_element?.getProperty('innerText');
            const difficulty_text = await difficulty_handle?.jsonValue() as string;
            const difficulty = Number(difficulty_text);
            const students_element = await DATA.$('.rating-count');
            const students_handle = await students_element?.getProperty('innerText');
            const students_text = await students_handle?.jsonValue() as string;
            // Deliberately not optional-chained: a page without `.rating-count`
            // throws here, which rejects this result and moves on to the next.
            const students = Number(students_text.trim().split(/\s/).shift());
            teacherData.set(teacher, { name: teacher, quality, takeAgain, difficulty, students });
            ++count_teachers;
            await DATA?.close();
            break;
          } catch {
            // This result was unusable; a later result may still succeed, so
            // the teacher is NOT removed here.
            await DATA?.close();
          }
        }
        // Drop the teacher only once every result has failed. Deleting inside
        // the catch above removed teachers whose later result succeeded,
        // leaving them rated but absent from the selectable teacher list.
        if (!teacherData.has(teacher)) teachers.delete(teacher);
        progressBar(teachers.size, count_teachers);
      }
      if (teacherData.size < 1) throw new Error("Teachers data not found");
      log('Closing page: misprofesores');
      await MISPROFESORES.close();
      log('Closing browser');
      await browser.close();

      // Replace each teacher name with its ratings record (or an empty record
      // when none was found) and order every subject's groups best-first:
      // ascending by quality then students, then reversed.
      subjects = Array.from(subjects).reduce((prev, [subject, schedule]) => {
        return prev.set(subject, schedule.map(uea => {
          uea.teacher = teacherData.has(uea.teacher as string)
            ? teacherData.get(uea.teacher as string) as teacherData
            : { name: '', quality: 0, takeAgain: 0, difficulty: 0, students: 0 };
          return uea as ueaData;
        }).sort(({ teacher: a }, { teacher: b }) => {
          const sort_quality = (a.quality < b.quality) ? -1 : ((a.quality > b.quality) ? 1 : 0);
          const sort_students = (a.students < b.students) ? -1 : ((a.students > b.students) ? 1 : 0);
          return sort_quality || sort_students;
        }).reverse());
      }, new Map<string, ueaData[]>());

      // Maps are not JSON-serialisable, so the replacer stores them as
      // `{ subjects: keys[], schedules: entries[] }`.
      const JSONUEA = JSON.stringify(subjects, (key: string, value: Map<string, ueaSchedule[]>) => {
        return value instanceof Map
          ? { subjects: Array.from(value.entries()).map(([k]) => k), schedules: Array.from(value.entries()) }
          : value;
      });
      writeFileSync(config.JSONUEA, JSONUEA, 'utf-8');
      log(`Uea´s data saved as "${config.JSONUEA}"`);
    }

    // ---- 3. Build conflict-free combinations -------------------------------
    const all_combinations = new Map<string, Map<string, ueaData>>();
    log('Choosing the most optimal schedules');
    for (const [subject, schedules_root] of subjects as Map<string, ueaData[]>) {
      const filteres_subjects = Array.from(subjects.keys())
        .filter(s => s !== subject)
        .sort((a, b) => (a < b) ? -1 : ((a > b) ? 1 : 0));
      for (const schedule_data of schedules_root) {
        const compatible_schedules = new Map<string, ueaData>();
        // Stop at the first group whose teacher has no name (no ratings record).
        if (!schedule_data.teacher.name) break;
        compatible_schedules.set(subject, schedule_data);
        // Greedily add, for every other subject, the first group that does not
        // clash with anything already chosen.
        filteres_subjects.forEach(subject_iteration => {
          const schedules_iteration = subjects.get(subject_iteration) as ueaData[];
          for (const schedule_curr of schedules_iteration) {
            let bool_val = true;
            for (const [, uea] of compatible_schedules) {
              for (const day of week) {
                const uea_day = uea[day];
                const curr_day = schedule_curr[day];
                if (uea_day && curr_day) {
                  // Clash: same start, same end, or either boundary of one
                  // slot falls strictly inside the other.
                  if (
                    uea_day.starts === curr_day.starts ||
                    uea_day.ends === curr_day.ends ||
                    (curr_day.starts > uea_day.starts && curr_day.starts < uea_day.ends) ||
                    (curr_day.ends > uea_day.starts && curr_day.ends < uea_day.ends) ||
                    (uea_day.starts > curr_day.starts && uea_day.starts < curr_day.ends) ||
                    (uea_day.ends > curr_day.starts && uea_day.ends < curr_day.ends)
                  ) {
                    bool_val = false;
                    break;
                  }
                }
              }
              // One clash is enough; no need to compare against the rest.
              if (!bool_val) break;
            }
            if (!!bool_val && !!schedule_curr.teacher.name) {
              compatible_schedules.set(subject_iteration, schedule_curr);
              break;
            }
          }
        });
        // Keep the combination only when every subject got a group.
        if (compatible_schedules.size === subjects.size) {
          const sorted_compatible = Array.from(compatible_schedules)
            .sort(([a], [b]) => (a < b) ? -1 : ((a > b) ? 1 : 0));
          // The code is used as the Map key, so an identical combination
          // reached from a different starting subject overwrites itself.
          const SCHEDULECODE = sorted_compatible.reduce((prev, [subject_code, schedule_code]) => {
            prev += schedule_code.group;
            prev += subject_code.substring(0, 3).toUpperCase();
            prev += subject_code.slice(-3).toUpperCase();
            prev += schedule_code.teacher.name.substring(0, 3).toUpperCase();
            prev += schedule_code.teacher.name.slice(-3).toUpperCase();
            return prev;
          }, '').padEnd(subjects.size * 18, '0');
          all_combinations.set(SCHEDULECODE, new Map(sorted_compatible));
        }
      }
    }

    // ---- 4. Rank by mean teacher quality, best first -----------------------
    let sorted_iterations = Array.from(all_combinations).sort(([, schedule_a], [, schedule_b]) => {
      const a = Array.from(schedule_a).reduce((prev, [, schedule_percent]) => {
        return prev += schedule_percent.teacher.quality;
      }, 0) / schedule_a.size;
      const b = Array.from(schedule_b).reduce((prev, [, schedule_percent]) => {
        return prev += schedule_percent.teacher.quality;
      }, 0) / schedule_b.size;
      return a - b;
    }).reverse();

    if (await confirm('Do you want to prioritize a teacher')) {
      const teacher_list = await selectTeachers(teachers);
      // Keep combinations containing at least as many matching groups as
      // there are prioritised teachers.
      sorted_iterations = sorted_iterations.filter(([, schedule_filter]) => {
        const filtered = Array.from(schedule_filter).filter(([, { teacher }]) => teacher_list.includes(teacher.name));
        return filtered.length >= teacher_list.length;
      });
      if (sorted_iterations.length < 1) throw new Error("No combinations lefts with this teachers");
    }

    // ---- 5. Render the chosen schedule to PDF ------------------------------
    const { subjects_info, hours } = await selectSchedule(sorted_iterations);
    const doc = new PDF();
    const max_size = Math.max(...subjects_info.map(({ teacher }) => teacher.length));
    const x = 50;
    const x_mins = doc.page.width - x;
    if (!existsSync(config.PDF.DIRECTORY)) mkdirSync(config.PDF.DIRECTORY);

    // Keep the stream so its completion can be awaited: the PDF must be fully
    // written before the viewer is launched. Everything between here and the
    // `await` is synchronous, so neither event can fire unobserved.
    const pdf_stream = createWriteStream(config.PDF.SCHEDULE);
    const pdf_written = new Promise<void>((resolve, reject) => {
      pdf_stream.on('finish', () => resolve());
      pdf_stream.on('error', reject);
    });
    doc.pipe(pdf_stream);

    doc.image('assets/icon-1024x1024.png', x, 50, { width: 30 })
      .fontSize(22.5)
      .text('Horario hecho por Cétr!co.Productions', x + 40, 56);
    doc.moveDown();

    // Legend: key, teacher and subject on one line per subject; column
    // offsets are estimated from the text lengths.
    subjects_info.forEach(({ key, subject, teacher }) => {
      const y = doc.y;
      const t = x + key.length * 6;
      doc.fontSize(10)
        .text(key, x, y)
        .text(teacher, t, y)
        .text(subject, t + max_size * 5.5, y);
    });
    doc.moveDown();

    // Timetable grid: one narrow hour column followed by five day columns.
    const y_start = doc.y;
    const hour_size = 5 * 8;
    const average_width = (x_mins - x - hour_size) / 5;
    const margin_top = 4.5;
    doc.moveTo(x, y_start).lineTo(x_mins, y_start).stroke();
    week.reduce((size, day) => {
      doc.text(day, size, y_start + margin_top, { width: average_width, align: 'center' });
      size += average_width;
      return size;
    }, x + hour_size);
    doc.moveTo(x, doc.y).lineTo(x_mins, doc.y).stroke();
    Object.keys(hours).forEach(hour => {
      const y = doc.y;
      doc.text(hour, x, y + margin_top, { width: hour_size, align: 'center' });
      week.reduce((size, day) => {
        const hour_day = hours[hour][day];
        if (hour_day) doc.text(hour_day, size, y + margin_top, { width: average_width, align: 'center' });
        size += average_width;
        return size;
      }, x + hour_size);
      doc.moveTo(x, doc.y).lineTo(x_mins, doc.y).stroke();
    });
    // Left and right borders of the grid.
    doc.moveTo(x, y_start).lineTo(x, doc.y).stroke();
    doc.moveTo(x_mins, y_start).lineTo(x_mins, doc.y).stroke();
    doc.end();
    await pdf_written;

    // Open the PDF with the platform's default opener.
    const command = platform.startsWith('win') ? 'start' : platform === 'darwin' ? 'open' : 'xdg-open';
    execSync(command + ' ' + config.PDF.SCHEDULE);
    if (!await confirm('Do you want to keep the uea backup')) unlinkSync(config.JSONUEA);
  } catch (e) {
    console.error(e);
    exit_code = 1;
  } finally {
    log(`Done`);
    process.exit(exit_code);
  }
})(process);