festivals-importer
Version:
358 lines (279 loc) • 8.72 kB
JavaScript
var fs = require('fs');
var async = require('async');
var request = require('request');
var cheerio = require('cheerio');
var moment = require('moment-timezone');
/**
 * Festival days to import.
 * The key is the value sent as the `event-day` query parameter; the value is
 * the calendar date used when building event timestamps.
 */
const days = {
    day1: '2015-10-03',
    day2: '2015-10-04'
};

/**
 * Category name -> icon URL.
 * categoriesParser compares an event's icon `src` against these URLs to
 * recover the category name.
 */
const categoriesMapper = {
    spotkania: 'http://komiksfestiwal.com/wp-content/uploads/2015/08/prelekcje.svg',
    inne: 'http://komiksfestiwal.com/wp-content/uploads/2015/08/other_events.svg',
    warsztaty: 'http://komiksfestiwal.com/wp-content/uploads/warsztaty_black.svg',
    'star wars': 'http://komiksfestiwal.com/wp-content/uploads/2015/08/star_wars.svg',
    autografy: 'http://komiksfestiwal.com/wp-content/uploads/2015/08/strefa_autografow.svg',
    'strefa dziecieca': 'http://komiksfestiwal.com/wp-content/uploads/2015/08/strefa_dziecieca_dziewczynka.svg',
    'gry planszowe': 'http://komiksfestiwal.com/wp-content/uploads/2015/08/gry_planszowe.svg'
};

/**
 * Ids of events already queued for download, keyed by event id.
 * parseEvents consults and fills this map to skip duplicates.
 */
const loadedEvents = {};
/**
 * Strip anything that looks like an HTML/XML tag (`<...>`) from a string.
 *
 * @param {string} val - text that may contain markup
 * @returns {string} the text with every `<...>` run removed
 */
var removeTags = function removeTags(val) {
    return val.replace(/(<([^>]+)>)/ig, '');
};
/**
 * Normalise an event description: drop markup, then trim surrounding
 * whitespace.
 *
 * @param {string} html - raw description text
 * @returns {string} tag-free, trimmed description
 */
var descriptionParser = function descriptionParser(html) {
    var plainText = removeTags(html);
    return plainText.trim();
};
/**
 * Convert a decimal hour (number or purely numeric string, e.g. 10.5 or
 * "9.25") into a zero-padded "HH:mm" clock string.
 *
 * Anything that is not a non-negative decimal number (undefined, null, an
 * already formatted "10:30", free text) is returned untouched so the caller
 * sees exactly what it passed in.
 *
 * @param {number|string} val - hour of day, fractional part = fraction of an hour
 * @returns {string|*} "HH:mm" for numeric input, otherwise `val` unchanged
 */
var getHour = function getHour(val) {
    var isNumericString = typeof val === 'string' && /^\s*\d+(\.\d+)?\s*$/.test(val);
    if (typeof val !== 'number' && !isNumericString) {
        return val;
    }

    var hours = parseFloat(val);
    if (!isFinite(hours) || hours < 0) {
        return val;
    }

    // Work in whole minutes: multiplying only the fractional part by 60 leaks
    // binary floating point noise into the string (10.1 -> "5.999999999999979").
    var totalMinutes = Math.round(hours * 60);
    var hh = Math.floor(totalMinutes / 60);
    var mm = totalMinutes % 60;

    // Two-digit fields keep "<date> HH:mm" in a form date parsers accept as ISO 8601.
    return (hh < 10 ? '0' : '') + hh + ':' + (mm < 10 ? '0' : '') + mm;
};
/**
 * Build the ISO start/finish timestamps of an event.
 *
 * @param {string} day - calendar date, e.g. "2015-10-03"
 * @param {number|string} timeStart - start hour as understood by getHour
 * @param {number|string} timeEnd - end hour as understood by getHour
 * @returns {{startAt: string, finishAt: string}} ISO 8601 strings
 *
 * NOTE(review): moment() is called without a zone, so the strings are
 * interpreted in the local time zone of the running process even though
 * moment-timezone is what the file requires - confirm that is intended.
 */
var durationParser = function durationParser(day, timeStart, timeEnd) {
    var toIso = function (time) {
        return moment(day + ' ' + getHour(time)).toISOString();
    };

    return {
        startAt: toIso(timeStart),
        finishAt: toIso(timeEnd)
    };
};
/**
 * Simplify raw HTML before it is parsed: remove <p>/<span> wrappers and
 * replace a handful of HTML entities and typographic characters with plain
 * equivalents.
 *
 * Both the entity spelling (e.g. `&ndash;`) and the literal character are
 * handled, so the result is the same whether or not the text was already
 * decoded.
 *
 * @param {string} val - raw markup (a missing value is treated as '')
 * @returns {string} simplified, trimmed markup
 */
var entitiesParser = function entitiesParser(val) {
    val = val || '';

    // Drop paragraph and span wrappers; their content is kept.
    val = val.replace(/(<p>|<\/p>|<span.+?>|<\/span>|<p.+?\">)/g, '');

    // Non-breaking space -> ordinary space.
    val = val.replace(/(&nbsp;|&#160;|\u00a0)/g, ' ');

    // Case-sensitive on purpose so a capital O-acute is never lowercased.
    val = val.replace(/&oacute;/g, '\u00f3');
    val = val.replace(/&Oacute;/g, '\u00d3');

    val = val.replace(/&(#0*39|apos);/g, "'");

    // Ellipsis -> three dots; en/em dash -> hyphen.
    val = val.replace(/(&hellip;|\u2026)/g, '...');
    val = val.replace(/(&ndash;|&mdash;|\u2013|\u2014)/g, '-');

    // Typographic double quotes -> straight quote.
    // NOTE(review): `&quot;` is deliberately left encoded. In this file the
    // function runs on markup that has not been parsed yet, where a decoded
    // quote would end an attribute value early - confirm if it should be decoded.
    val = val.replace(/(&ldquo;|&rdquo;|&bdquo;|\u201c|\u201d|\u201e)/g, '"');

    // Ampersand last, so "&amp;nbsp;" becomes the text "&nbsp;" and is not
    // decoded a second time.
    val = val.replace(/&amp;/g, '&');

    return val.trim();
};
/**
 * Turn a category icon URL into the event's category list.
 *
 * The URL is looked up among the values of `categoriesMapper`; an unknown
 * URL is logged and falls back to the 'inne' category. A missing/blank URL
 * yields an empty list.
 *
 * @param {string} [val] - `src` of the category icon
 * @returns {Array<{parent: null, name: string}>} zero or one category
 */
var categoriesParser = function categoriesParser(val) {
    val = (val || '').trim();

    var cat;
    // Locally declared index: the lookup must not create a global variable
    // (which is also a ReferenceError in strict mode).
    var names = Object.keys(categoriesMapper);
    for (var idx = 0; idx < names.length; idx++) {
        if (categoriesMapper[names[idx]] === val) {
            cat = names[idx];
            break;
        }
    }

    if (!cat) {
        cat = 'inne';
        console.log('couldn\'t find category for ' + val);
    }

    var categories = [];
    if (val) {
        categories.push({
            parent: null,
            name: cat
        });
    }
    return categories;
};
/**
 * Wrap a venue name in the list-of-places structure used by events.
 *
 * @param {string} [val] - venue name; missing or blank means "no place"
 * @returns {Array<{parent: null, name: string, openingTimes: Array}>}
 *          an empty list, or a single place with the trimmed name
 */
var placesParser = function placesParser(val) {
    var venue = (val || '').trim();
    if (!venue) {
        return [];
    }
    return [{
        parent: null,
        name: venue,
        openingTimes: []
    }];
};
/**
 * Collect the people running an event from its title and its description.
 *
 * Title: whatever is left of `val` after removing the already parsed `name`
 * and the first hyphen is split on commas; the prefixes "prowadzenie " and
 * "Warsztaty Komiksu: " are dropped from each piece.
 * Description: the text after "prowadzenie " is split on ", ", " i " and
 * " oraz ".
 *
 * Pieces that end up blank (e.g. after a trailing comma) are skipped, and a
 * missing description is treated as empty text.
 *
 * @param {string} name - event name as returned by nameParser
 * @param {string} description - plain-text description
 * @param {string} [val] - full, unparsed title
 * @returns {Array<{name: string, organization: null}>} title authors first,
 *          then description authors
 */
var authorsParser = function authorsParser(name, description, val) {
    var toAuthor = function (rawName) {
        return {
            name: rawName.trim(),
            organization: null
        };
    };
    var hasName = function (author) {
        return author.name !== '';
    };

    var fromTitle = [];
    var remainder = (val || '').trim();
    if (remainder) {
        // String patterns: only the first occurrence of each is removed.
        remainder = remainder.replace(name, '').replace('-', '').trim();
        if (remainder) {
            fromTitle = remainder.split(',').map(function (part) {
                return toAuthor(
                    part.replace('prowadzenie ', '').replace('Warsztaty Komiksu: ', '')
                );
            });
        }
    }

    var fromDescription = [];
    var matches = (description || '').match(/.*\–? ?prowadzenie (.+)/);
    if (matches) {
        fromDescription = matches[1].split(/, | i | oraz /).map(toAuthor);
    }

    return fromTitle.concat(fromDescription).filter(hasName);
};
/**
 * Extract the event name from a full title.
 *
 * The text after the last hyphen is cut off, unless that tail contains one
 * of the phrases listed below, in which case the title is kept whole.
 *
 * @param {string} [val] - full title; missing is treated as ''
 * @returns {string} trimmed event name
 */
var nameParser = function nameParser(val) {
    // A tail containing any of these phrases is not cut off.
    var keepWholeMarkers = [
        'Zin Zin Press',
        'Nowe rzeczywistości',
        'Warsztaty Komiksu dla',
        'warsztaty komiksowe z WawaKomiks',
        'lecie Bitew Komiksowych',
        'Komiks w sitodruku',
        'ostatnie lata w największym',
        'plany wydawnicze',
        'wręczenie nagród',
        'rodziny',
        'Nocny Spacer'
    ];

    var title = (val || '').trim();
    var segments = title.split('-');

    if (segments.length > 1) {
        var tail = segments.pop();
        var mustKeepWhole = keepWholeMarkers.some(function (marker) {
            return tail.indexOf(marker) !== -1;
        });
        if (!mustKeepWhole) {
            title = segments.join('-');
        }
    }

    return title.trim();
};
/**
 * Find the events listed on a programme page and download each one.
 *
 * Every `div.event` inside `div.table` contributes its data attributes.
 * Entries without an `id`, or whose id was already seen, are logged and
 * skipped; the rest are tagged with `day` and handed to getEventContent via
 * async.map.
 *
 * `loadedEvents` is module-level state, so an id stays marked as seen for as
 * long as the module stays loaded.
 *
 * @param {string} body - HTML of the programme page
 * @param {string} day - calendar date stored on each entry as `day`
 * @param {function} callback - passed to async.map as its final callback
 */
var parseEvents = function parseEvents(body, day, callback) {
    var $ = cheerio.load(entitiesParser(body), {decodeEntities: false});
    var table = $('div.table');
    var pending = [];

    $('div.event', table).each(function (index, node) {
        var attrs = $(node).data();

        if (!attrs.hasOwnProperty('id')) {
            console.log('missing "id" in data', attrs);
            return;
        }
        if (loadedEvents.hasOwnProperty(attrs.id)) {
            console.log('skip data', attrs);
            return;
        }

        loadedEvents[attrs.id] = true;
        attrs.day = day;
        pending.push(attrs);
    });

    async.map(pending, getEventContent, callback);
};
/**
 * Build an event record from the HTML of a single event pop-up.
 *
 * @param {string} body - pop-up HTML
 * @param {*} id - event id; stored as a string in `metadata`
 * @param {string} day - calendar date of the event
 * @param {number|string} timeStart - start hour, forwarded to durationParser
 * @param {number|string} timeEnd - end hour, forwarded to durationParser
 * @returns {Object} record with name, description, duration, authors,
 *          places, categories, metadata, images and tags
 */
var parseEvent = function parseEvent(body, id, day, timeStart, timeEnd) {
    var $ = cheerio.load(body, {decodeEntities: false});

    // The raw title feeds both the name and the author extraction.
    var rawTitle = $('.pop-up-bottom-title').text();

    var name = nameParser(rawTitle);
    var duration = durationParser(day, timeStart, timeEnd);
    // The src of the last image in the header paragraph is what gets matched
    // against the category icons.
    var iconSrc = $('.pop-up-bottom-header p').children('img').last().attr('src');
    var categories = categoriesParser(iconSrc);
    var places = placesParser($('.pop-up-top-right div p strong').text());
    var description = descriptionParser($('.pop-up-bottom-content').text());
    var authors = authorsParser(name, description, rawTitle);

    return {
        name: name,
        description: description,
        duration: duration,
        authors: authors,
        places: places,
        categories: categories,
        metadata: [id + ''],
        images: [],
        tags: []
    };
};
/**
 * Download the programme page of one festival day and parse its events.
 *
 * @param {string} day - key of `days`; also sent as the `event-day` query value
 * @param {function(Error|null, Array|null)} callback - receives the parsed
 *        events, or an error when the request fails or the server answers
 *        with a status other than 200
 */
var loadDayContent = function loadDayContent(day, callback) {
    request('http://komiksfestiwal.com/program/?event-day=' + day, function (error, response, body) {
        if (error) {
            return callback(error, null);
        }
        // A non-200 answer carries no transport error; report it explicitly
        // rather than handing "no error, no result" to the caller.
        if (!response || response.statusCode !== 200) {
            var status = response ? response.statusCode : 'unknown';
            return callback(
                new Error('Unexpected HTTP status ' + status + ' while loading programme for ' + day),
                null
            );
        }
        return parseEvents(body, days[day], callback);
    });
};
/**
 * Load every configured festival day and merge the events into one list.
 *
 * Works for any number of entries in `days`; a day that produced no list
 * contributes nothing.
 *
 * @param {function(Error|null, Array|null)} callback - receives the combined
 *        event list, or the first error reported by async.map
 */
var getEventsContent = function getEventsContent(callback) {
    async.map(Object.keys(days), loadDayContent, function (err, results) {
        if (err) {
            return callback(err, null);
        }
        // Flatten one level: results holds one array of events per day.
        var events = (results || []).reduce(function (all, dayEvents) {
            return Array.isArray(dayEvents) ? all.concat(dayEvents) : all;
        }, []);
        return callback(err, events);
    });
};
/**
 * Download the pop-up of one event and parse it into an event record.
 *
 * @param {{id: *, day: string, timeStart: *, timeEnd: *}} data - entry
 *        prepared by parseEvents
 * @param {function(Error|null, Object|null)} callback - receives the parsed
 *        event. On a request error it gets (error, null); on a status other
 *        than 200 it gets (null, null), i.e. the event is silently missing.
 */
var getEventContent = function getEventContent(data, callback) {
    var popUpUrl = 'http://komiksfestiwal.com/wp-content/themes/template/templates/events-programme/pop-ups/pop-up.php';
    var payload = {
        form: {
            type: 'event',
            ID: data.id
        }
    };

    request.post(popUpUrl, payload, function (error, response, body) {
        if (error || response.statusCode != 200) {
            return callback(error, null);
        }
        var event = parseEvent(body, data.id, data.day, data.timeStart, data.timeEnd);
        return callback(null, event || null);
    });
};
/**
 * Entry point of the import: delegates to getEventsContent.
 *
 * @param {function(Error|null, Array|null)} callback - forwarded unchanged
 * @returns {*} whatever getEventsContent returns
 */
var importEvents = function importEvents(callback) {
    var outcome = getEventsContent(callback);
    return outcome;
};
/**
 * Handler-style entry point: run the import and answer with `{events: [...]}`.
 *
 * @param {*} data - not used
 * @param {function(Error|null, {events: Array}=)} callback - receives the
 *        import error when there is one, otherwise the events wrapped in an
 *        object
 */
var handle = function handle(data, callback) {
    importEvents(function (err, results) {
        // Report failures to the caller instead of answering with an empty
        // payload as if the import had succeeded.
        if (err) {
            return callback(err);
        }
        return callback(null, {events: results});
    });
};
module.exports = {
parseEvent: parseEvent,
parseEvents: parseEvents,
getEventsContent: getEventsContent,
importEvents: importEvents,
handle: handle
};
//
//importEvents(function (err, results) {
// fs.writeFile('komiksfestiwal2015.json', JSON.stringify(results), function (err) {
// if (err) {
// throw err;
// }
// //
// // console.log('It\'s saved!');
// //console.log(err, results);
// console.dir(results, {depth: null});
// //console.dir(results[0], {depth: null});
// });
//
//});