/*
 * google-carousel-scraper
 * Version:
 * Scrape the Google mobile (AMP) carousel
 * 106 lines (79 loc) • 3.1 kB
 * JavaScript
 */
;
// Flag `exports` as coming from a transpiled ES module; this is the property
// that interop helpers such as _interopRequireDefault (below) test for.
Object.defineProperty(exports, "__esModule", {
value: true
});
var _regenerator = require('babel-runtime/regenerator');
var _regenerator2 = _interopRequireDefault(_regenerator);
var _promise = require('babel-runtime/core-js/promise');
var _promise2 = _interopRequireDefault(_promise);
var _asyncToGenerator2 = require('babel-runtime/helpers/asyncToGenerator');
var _asyncToGenerator3 = _interopRequireDefault(_asyncToGenerator2);
var _zombie = require('zombie');
var _zombie2 = _interopRequireDefault(_zombie);
var _fuzzyDateParseNaive = require('@quarterto/fuzzy-date-parse-naive');
var _fuzzyDateParseNaive2 = _interopRequireDefault(_fuzzyDateParseNaive);
/**
 * Normalise a required module so its default export is always reachable
 * through a `default` property.
 *
 * @param {*} obj - Value returned by require().
 * @returns {*} `obj` itself when it is truthy and carries a truthy
 *   `__esModule` flag; otherwise a new object `{ default: obj }`.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }
  return { default: obj };
}
// User-Agent string handed to the zombie browser as its `userAgent` option;
// it identifies the client as Mobile Safari on an iPhone running iOS 9.3.2.
// NOTE(review): presumably needed so Google serves the mobile results page
// that contains the AMP carousel — confirm.
var IPHONE_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13F69 Safari/601.1';
/**
 * Build the google.co.uk search URL for a query.
 *
 * @param {string} searchTerm - Raw query text; it is percent-encoded here.
 * @returns {string} Search URL with the encoded term as the `q` parameter.
 */
function getSearchUrl(searchTerm) {
  var encodedQuery = encodeURIComponent(searchTerm);
  return ['https://www.google.co.uk/search?q=', encodedQuery].join('');
}
exports.default = function () {
var ref = (0, _asyncToGenerator3.default)(_regenerator2.default.mark(function _callee(searchTerm) {
var options = arguments.length <= 1 || arguments[1] === undefined ? { timeout: 15000 } : arguments[1];
var url, browser, timeout, timeoutPromise, links;
return _regenerator2.default.wrap(function _callee$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
url = getSearchUrl(searchTerm);
_zombie2.default.waitDuration = 999999;
browser = new _zombie2.default({ userAgent: IPHONE_AGENT });
timeout = void 0;
timeoutPromise = new _promise2.default(function (resolve, reject) {
var err = new Error('Search \'' + searchTerm + '\' timed out');
err.timeout = true;
timeout = setTimeout(reject, options.timeout, err);
});
_context.prev = 5;
_context.prev = 6;
_context.next = 9;
return _promise2.default.race([browser.visit(url), timeoutPromise]);
case 9:
_context.next = 15;
break;
case 11:
_context.prev = 11;
_context.t0 = _context['catch'](6);
if (!_context.t0.timeout) {
_context.next = 15;
break;
}
throw _context.t0;
case 15:
// 🙈
links = browser.queryAll('[data-ampgroup=true] a[data-amp]').map(function (link) {
return {
link: link.getAttribute('data-amp'),
title: link.lastElementChild.textContent,
date: (0, _fuzzyDateParseNaive2.default)(link.nextSibling.textContent),
publisher: link.getAttribute('data-amp-title')
};
});
return _context.abrupt('return', links);
case 17:
_context.prev = 17;
clearTimeout(timeout);
browser.destroy();
return _context.finish(17);
case 21:
case 'end':
return _context.stop();
}
}
}, _callee, this, [[5,, 17, 21], [6, 11]]);
}));
return function (_x, _x2) {
return ref.apply(this, arguments);
};
}();
// Replace the CommonJS export with the default export itself, so that
// require() of this module yields the scraper function directly rather than
// an object with a `default` property.
module.exports = exports['default'];