@padrocha/uam-scraping
Version:
Scraping of teachers
417 lines (356 loc) • 20.3 kB
text/typescript
#!/usr/bin/env node
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync, createWriteStream } from 'fs';
import puppeteer from 'puppeteer';
import PDF from 'pdfkit';
import { execSync } from 'child_process';
import { config } from './config';
import { week, log, confirm, progressBar, selectUEAS, tryDOM, timeParse, selectSchedule, selectTeachers, password, askUser } from './utils';
(async ({ argv, platform, stdout }: NodeJS.Process) => {
try {
// Working state shared by the later phases of the script:
//   ueas     - UEAs offered to the user (filled while scraping SIIUAM)
//   subjects - UEA name -> schedule rows for that UEA
//   teachers - distinct teacher names found in those rows
const ueas: uea[] = [];
let subjects = new Map<string, ueaSchedule[]>();
let teachers = new Set<string>();
// A JSON backup may exist at config.JSONUEA; offer to reuse it instead of scraping.
const backup_found = existsSync(config.JSONUEA);
if (backup_found) {
  log('Uea´s backup found');
  const reuse_backup = await confirm('Do you wanna use it');
  if (!reuse_backup) {
    // Move the cursor one line up and clear from there down
    // (presumably to erase the answered prompt - confirm against utils).
    stdout.moveCursor(0, -1);
    stdout.clearScreenDown();
  } else {
    log('Reading Uea´s')
    const raw_backup = readFileSync(config.JSONUEA, 'utf8');
    // The backup stores the Map as an array of [name, rows] entries under "schedules".
    const { schedules }: { schedules: [string, ueaData[]][] } = JSON.parse(raw_backup);
    if (schedules) {
      subjects = new Map<string, ueaData[]>(schedules);
      // Rebuild the teacher set; names of length <= 1 (e.g. the empty
      // placeholder name) are skipped.
      const backup_teachers = new Set<string>();
      for (const [, schedule] of subjects as Map<string, ueaData[]>) {
        for (const { teacher } of schedule) {
          if (teacher.name.length > 1)
            backup_teachers.add(teacher.name);
        }
      }
      teachers = backup_teachers;
    }
  }
}
if (subjects.size < 1) {
// Nothing was loaded from a backup: scrape from scratch, starting with the SIIUAM login.
log('Loading page: SIIUAM');
const browser = await puppeteer.launch({ headless: config.HEADLESS });
const SIIUAM = await browser.newPage();
await SIIUAM.goto(config.PATH.SIIUAM, { waitUntil: 'load', timeout: 0 });
SIIUAM.setDefaultNavigationTimeout(config.NAVIGATION);
// The page is a frameset; the login form sits in bodyFrame > controlFrame > menuFrame.
const bodyFrame = await SIIUAM.$('frame#bodyFrame');
const body_frame = await bodyFrame?.contentFrame();
const controlFrame = await body_frame?.$('frame#controlFrame');
const control_frame = await controlFrame?.contentFrame();
const menu_frame = await tryDOM<puppeteer.Frame>(async () => {
  const menuFrame = await control_frame?.$('frame#menuFrame');
  return await menuFrame?.contentFrame();
}, SIIUAM);
// Credentials come from the config when present, otherwise from interactive prompts.
const USER = config.USER ? config.USER : await askUser();
const PASS = config.PASS ? config.PASS : await password();
log('SIIUAM > Loggin user');
await menu_frame.type('input[name="NOMBRE.IDENTIFICACION.NONMODELED"]', USER);
await menu_frame.type('input[name="COMPLEMENTO.IDENTIFICACION.NONMODELED"]', PASS);
// Register the navigation wait BEFORE the click that triggers it. Waiting
// only after the click can miss a navigation that already completed and
// then block until the navigation timeout expires.
await Promise.all([
  menu_frame.waitForNavigation(),
  menu_frame.click('input[name="GO.IDENTIFICACION.NONMODELED"]')
]);
log('Fetching UEA Schedules');
// Follow the menu link whose target view lists the selectable UEA cells.
await menu_frame.click('a[href="CTWBS012"]');
const info_frame = await tryDOM(async () => {
  const infoFrame = await body_frame?.$('frame#infoFrame');
  return await infoFrame?.contentFrame();
}, SIIUAM);
// The cells live two iframes deep inside the info frame.
const uea_disponible = await tryDOM(async () => {
  const ifrmBol = await info_frame.$('iframe#ifrm_bol');
  const ifrm_bol = await ifrmBol?.contentFrame();
  const _frame = await ifrm_bol?.$('iframe');
  const frame = await _frame?.contentFrame();
  return await frame?.$$('.celda.uea_disponible');
}, SIIUAM);
// The handles form a plain array, so a regular loop is enough. The `?? []`
// lets a missing frame fall through to the explicit "not found" error below
// instead of failing with a "not iterable" TypeError.
for (const uea of uea_disponible ?? []) {
  // The element id is used as the UEA key; the '.nombre' child holds its name.
  const id = await uea.getProperty('id');
  const key = await id?.jsonValue() as string;
  const _name = await uea.$('.nombre');
  const name = await _name?.evaluate(e => e.textContent) as string;
  ueas.push({ key, name });
}
if (ueas.length < 1)
  throw new Error("Uea´s not found");
// Let the user pick which of the listed UEAs to process.
const ueas_selected = await selectUEAS(ueas);
const horario_UEA = await tryDOM(async () => {
  return await menu_frame.$('a[href="IEWBC005.oConsulta"]');
}, SIIUAM);
// Start waiting for the navigation before triggering it; see the same
// pattern in the Puppeteer click documentation.
await Promise.all([
  info_frame.waitForNavigation(),
  horario_UEA.click()
]);
let count_ueas = 0;
progressBar();
for (const uea of ueas_selected) {
  // Typing the key followed by '\n' submits the query and navigates the
  // frame, so the wait is registered together with the typing.
  await Promise.all([
    info_frame.waitForNavigation(),
    info_frame.type('input[name="CD_UEA.CONTROL.NONMODELED"]', uea.key + '\n')
  ]);
  const tr_array = await tryDOM(async () => {
    const fieldset = await info_frame.$('fieldset');
    return await fieldset?.$$("tr");
  }, SIIUAM);
  const schedule = new Array<ueaSchedule>();
  // Skip the first row (presumably the table header - confirm against the page);
  // a missing table yields no rows instead of a TypeError.
  const data_rows = (tr_array ?? []).slice(1);
  for (const tr of data_rows) {
    const td_array = await tryDOM(async () => {
      return await tr.$$('td');
    }, SIIUAM);
    // Reads the rendered text of the cell at `index` in the current row.
    const text_at = async (index: number) => {
      const handle = await td_array[index].getProperty('innerText');
      return await handle?.jsonValue() as string;
    };
    // Column layout used by the page: 0 teacher, 1 group, 4..8 Monday..Friday.
    const teacher = await text_at(0);
    const group = await text_at(1);
    const monday = await text_at(4);
    const tuesday = await text_at(5);
    const wednesday = await text_at(6);
    const thursday = await text_at(7);
    const friday = await text_at(8);
    schedule.push({
      key: uea.key,
      teacher,
      group,
      monday: timeParse(monday),
      tuesday: timeParse(tuesday),
      wednesday: timeParse(wednesday),
      thursday: timeParse(thursday),
      friday: timeParse(friday)
    });
  }
  subjects.set(uea.name, schedule);
  progressBar(ueas_selected.length, ++count_ueas);
  // Triple-click the key field before the next iteration types into it
  // (presumably selects the old key so it gets replaced - confirm).
  await info_frame.click('input[name="CD_UEA.CONTROL.NONMODELED"]', { clickCount: 3 });
}
if (subjects.size < 1)
  throw new Error("Subjects not found");
// Gather every distinct teacher name found in the scraped rows.
// At this point `teacher` is still the raw name string; names of
// length <= 1 are ignored.
const scraped_teachers = new Set<string>();
for (const [, schedule] of subjects) {
  for (const { teacher } of schedule) {
    const teacher_name = teacher as string;
    if (teacher_name.length > 1)
      scraped_teachers.add(teacher_name);
  }
}
teachers = scraped_teachers;
// Ratings keyed by teacher name, filled from the misprofesores site.
const teacherData = new Map<string, teacherData>();
log('Closing page: SIIUAM');
await SIIUAM.close();
log('Loading page: misprofesores');
const MISPROFESORES = await browser.newPage();
await MISPROFESORES.goto(config.PATH.MISPROFESORES, { waitUntil: 'load', timeout: 0 });
MISPROFESORES.setDefaultNavigationTimeout(config.NAVIGATION);
log('Fetching teachers data');
progressBar();
let count_teachers = 0;
for (const teacher of teachers) {
  // Search for the teacher and wait for at least one result link.
  await MISPROFESORES.waitForSelector('input[name="q"]');
  await MISPROFESORES.type('input[name="q"]', teacher + '\n');
  await MISPROFESORES.waitForSelector('a.gs-title');
  const results = await MISPROFESORES.$$('.gsc-webResult.gsc-result');
  // Try the results in order and stop at the first profile that yields ratings.
  let found = false;
  for (const result of results ?? []) {
    const a = await result?.$('a.gs-title');
    const teacher_path_handle = await a?.getProperty('href');
    const teacher_path = await teacher_path_handle?.jsonValue() as string;
    // A result without a link cannot be opened; skip it rather than crash in goto().
    if (!teacher_path)
      continue;
    const DATA = await browser.newPage();
    await DATA.goto(teacher_path, { waitUntil: 'load', timeout: 0 });
    try {
      // Rendered text of the first element matching `selector`, or undefined when absent.
      const text_of = async (selector: string) => {
        const element = await DATA.$(selector);
        const handle = await element?.getProperty('innerText');
        return await handle?.jsonValue() as string;
      };
      const quality = Number(await text_of('.quality .grade'));
      const takeAgain_text = await text_of('.takeAgain .grade');
      // Drop the trailing character before converting (presumably a '%' sign - confirm).
      const takeAgain = Number(takeAgain_text?.slice(0, -1));
      const difficulty = Number(await text_of('.difficulty .grade'));
      const students_text = await text_of('.rating-count');
      // A page without '.rating-count' throws here, which marks this result
      // as "not a usable profile" (handled by the catch below).
      const students = Number(students_text.trim().split(/\s/).shift());
      teacherData.set(teacher, {
        name: teacher,
        quality,
        takeAgain,
        difficulty,
        students
      });
      found = true;
    } catch {
      // This result lacked the expected rating markup; fall through to the next one.
    } finally {
      // Single close point: the page is released on success and on failure.
      await DATA.close();
    }
    if (found)
      break;
  }
  if (found) {
    ++count_teachers;
  } else {
    // Drop the teacher only once EVERY result failed. Deleting on the first
    // failure hid teachers whose later result did provide ratings.
    teachers.delete(teacher);
  }
  progressBar(teachers.size, count_teachers);
}
if (teacherData.size < 1)
  throw new Error("Teachers data not found");
log('Closing page: misprofesores');
await MISPROFESORES.close();
log('Closing browser');
await browser.close();
// Replace each row's teacher name with its rating record (or an empty
// placeholder when no rating was found) and order each subject's rows from
// best to worst: by quality first, then by number of students.
const rated_subjects = new Map<string, ueaData[]>();
for (const [subject, schedule] of Array.from(subjects)) {
  const rated_rows = schedule.map(uea => {
    const teacher_name = uea.teacher as string;
    if (teacherData.has(teacher_name)) {
      uea.teacher = teacherData.get(teacher_name) as teacherData;
    } else {
      uea.teacher = {
        name: '',
        quality: 0,
        takeAgain: 0,
        difficulty: 0,
        students: 0
      };
    }
    return uea as ueaData;
  });
  // Ascending comparison, then reversed, exactly so ties keep the same
  // final order as a sort-then-reverse.
  rated_rows.sort(({ teacher: a }, { teacher: b }) => {
    let order = 0;
    if (a.quality < b.quality)
      order = -1;
    else if (a.quality > b.quality)
      order = 1;
    if (order === 0) {
      if (a.students < b.students)
        order = -1;
      else if (a.students > b.students)
        order = 1;
    }
    return order;
  });
  rated_rows.reverse();
  rated_subjects.set(subject, rated_rows);
}
subjects = rated_subjects;
// JSON.stringify cannot serialise a Map directly, so the replacer turns any
// Map into { subjects: [keys...], schedules: [[key, value]...] }; every other
// value passes through unchanged.
const map_replacer = (_key: string, value: unknown) => {
  if (!(value instanceof Map))
    return value;
  const entries = Array.from(value.entries());
  return {
    subjects: entries.map(([k]) => k),
    schedules: entries
  };
};
const JSONUEA = JSON.stringify(subjects, map_replacer);
writeFileSync(config.JSONUEA, JSONUEA, 'utf-8');
log(`Uea´s data saved as "${config.JSONUEA}"`);
}
// Every complete, clash-free combination found, keyed by a generated code.
const all_combinations = new Map<string, Map<string, ueaData>>();
log('Choosing the most optimal schedules');
// Ascending order using the plain `<` / `>` string comparison.
const by_name = (a: string, b: string) => (a < b) ? -1 : ((a > b) ? 1 : 0);
// True when two schedule rows share a day on which their time ranges
// collide: same start, same end, or either boundary strictly inside the other range.
const clashes = (uea: ueaData, schedule_curr: ueaData): boolean => {
  for (const day of week) {
    const uea_day = uea[day];
    const curr_day = schedule_curr[day];
    if (!uea_day || !curr_day)
      continue;
    if (
      uea_day.starts === curr_day.starts ||
      uea_day.ends === curr_day.ends ||
      (curr_day.starts > uea_day.starts && curr_day.starts < uea_day.ends) ||
      (curr_day.ends > uea_day.starts && curr_day.ends < uea_day.ends) ||
      (uea_day.starts > curr_day.starts && uea_day.starts < curr_day.ends) ||
      (uea_day.ends > curr_day.starts && uea_day.ends < curr_day.ends)
    )
      return true;
  }
  return false;
};
for (const [subject, schedules_root] of subjects as Map<string, ueaData[]>) {
  // All the other subjects, visited in name order.
  const remaining_subjects = Array.from(subjects.keys())
    .filter(name => name !== subject)
    .sort(by_name);
  for (const schedule_data of schedules_root) {
    // Stop at the first row without a teacher name (rows are stored best-first).
    if (!schedule_data.teacher.name)
      break;
    // Seed the combination with this row, then greedily add, per remaining
    // subject, the first named row that clashes with nothing chosen so far.
    const compatible_schedules = new Map<string, ueaData>();
    compatible_schedules.set(subject, schedule_data);
    for (const subject_iteration of remaining_subjects) {
      const schedules_iteration = subjects.get(subject_iteration) as ueaData[];
      for (const schedule_curr of schedules_iteration) {
        let is_free = true;
        for (const chosen of compatible_schedules.values()) {
          if (clashes(chosen, schedule_curr))
            is_free = false;
        }
        if (is_free && !!schedule_curr.teacher.name) {
          compatible_schedules.set(subject_iteration, schedule_curr);
          break;
        }
      }
    }
    // Keep the combination only when every subject got a row.
    if (compatible_schedules.size === subjects.size) {
      const sorted_compatible = Array.from(compatible_schedules)
        .sort(([a], [b]) => by_name(a, b));
      // Code = per subject: group + first/last 3 letters of the subject +
      // first/last 3 letters of the teacher, upper-cased, then right-padded with '0'.
      let SCHEDULECODE = '';
      for (const [subject_code, schedule_code] of sorted_compatible) {
        const teacher_name = schedule_code.teacher.name;
        SCHEDULECODE += schedule_code.group;
        SCHEDULECODE += subject_code.substring(0, 3).toUpperCase();
        SCHEDULECODE += subject_code.slice(-3).toUpperCase();
        SCHEDULECODE += teacher_name.substring(0, 3).toUpperCase();
        SCHEDULECODE += teacher_name.slice(-3).toUpperCase();
      }
      SCHEDULECODE = SCHEDULECODE.padEnd(subjects.size * 18, '0');
      all_combinations.set(SCHEDULECODE, new Map(sorted_compatible));
    }
  }
}
// Mean teacher quality over all rows of one combination.
const average_quality = (schedule: Map<string, ueaData>) => {
  let total = 0;
  for (const { teacher } of schedule.values())
    total += teacher.quality;
  return total / schedule.size;
};
// Rank combinations from highest to lowest mean quality (ascending sort, then reversed).
let sorted_iterations = Array.from(all_combinations)
  .sort(([, schedule_a], [, schedule_b]) => average_quality(schedule_a) - average_quality(schedule_b))
  .reverse();
if (await confirm('Do you want to prioritize a teacher')) {
  const teacher_list = await selectTeachers(teachers);
  // Keep only combinations that include EVERY selected teacher. Distinct
  // names are counted, so a teacher who appears in two subjects is not
  // mistaken for two different selected teachers.
  sorted_iterations = sorted_iterations.filter(([, schedule_filter]) => {
    const matched = new Set<string>();
    for (const { teacher } of schedule_filter.values()) {
      if (teacher_list.includes(teacher.name))
        matched.add(teacher.name);
    }
    return matched.size >= teacher_list.length;
  });
  if (sorted_iterations.length < 1)
    throw new Error("No combinations lefts with this teachers");
}
// Let the user pick one combination; the helper returns the legend rows
// (subjects_info) and the timetable grid (hours) to print.
const { subjects_info, hours } = await selectSchedule(sorted_iterations);
const doc = new PDF();
// Longest teacher string, used to place the subject column of the legend.
const max_size = Math.max(...subjects_info.map(({ teacher }) => teacher.length));
const x = 50;
const x_mins = doc.page.width - x;
// `recursive` creates missing parent directories and does not throw when
// the directory already exists.
mkdirSync(config.PDF.DIRECTORY, { recursive: true });
const pdf_stream = createWriteStream(config.PDF.SCHEDULE);
// Settles once the whole document has been flushed to the file (or the write failed).
const pdf_written = new Promise<void>((resolve, reject) => {
  pdf_stream.once('finish', () => resolve());
  pdf_stream.once('error', reject);
});
doc.pipe(pdf_stream);
// Header: icon plus title.
doc.image('assets/icon-1024x1024.png', x, 50, { width: 30 })
  .fontSize(22.5)
  .text('Horario hecho por Cétr!co.Productions', x + 40, 56);
doc.moveDown();
// Legend: one line per subject with key, teacher and subject name.
subjects_info.forEach(({ key, subject, teacher }) => {
  const y = doc.y;
  const teacher_x = x + key.length * 6;
  doc.fontSize(10)
    .text(key, x, y)
    .text(teacher, teacher_x, y)
    .text(subject, teacher_x + max_size * 5.5, y)
});
doc.moveDown();
// Timetable grid: a narrow hour column followed by one equal-width column per day.
const y_start = doc.y;
const hour_size = 5 * 8;
const average_width = (x_mins - x - hour_size) / 5;
const margin_top = 4.5;
doc.moveTo(x, y_start).lineTo(x_mins, y_start).stroke();
// Day names row; the accumulator is the x position of the next column.
week.reduce((size, day) => {
  doc.text(day, size, y_start + margin_top, { width: average_width, align: 'center' });
  size += average_width;
  return size;
}, x + hour_size);
doc.moveTo(x, doc.y).lineTo(x_mins, doc.y).stroke();
// One row per hour key, with a horizontal rule under each row.
Object.keys(hours).forEach(hour => {
  const y = doc.y;
  doc.text(hour, x, y + margin_top, { width: hour_size, align: 'center' })
  week.reduce((size, day) => {
    const hour_day = hours[hour][day];
    if (hour_day)
      doc.text(hour_day, size, y + margin_top, { width: average_width, align: 'center' });
    size += average_width;
    return size;
  }, x + hour_size);
  doc.moveTo(x, doc.y).lineTo(x_mins, doc.y).stroke();
});
// Left and right borders of the grid.
doc.moveTo(x, y_start).lineTo(x, doc.y).stroke();
doc.moveTo(x_mins, y_start).lineTo(x_mins, doc.y).stroke();
doc.end();
// end() only finalises the document; the file is complete once the write
// stream has finished, so wait before anything tries to open it.
await pdf_written;
// Open the PDF with the platform's default viewer. The path is quoted so
// directories containing spaces work; on Windows `start` treats its first
// quoted argument as a window title, hence the empty "" before the path.
const command = platform.startsWith('win') ? 'start ""' : platform === 'darwin' ? 'open' : 'xdg-open';
execSync(`${command} "${config.PDF.SCHEDULE}"`);
if (!await confirm('Do you want to keep the uea backup'))
  unlinkSync(config.JSONUEA);
} catch (e) {
console.error(e);
// Record the failure; the actual exit happens in `finally`.
process.exitCode = 1;
} finally {
// Calling process.exit() inside try/catch would terminate the process
// before this block runs, so the final message is logged first and the
// exit (with process.exitCode, or 0 when unset) is done here.
log(`Done`);
process.exit();
}
})(process);