UNPKG

facebook-group-posts-scraper

Version:
681 lines (631 loc) 20.6 kB
#!/usr/bin/env node
const puppeteer = require('puppeteer');
const selectors = require('./selectors/facebook');
const fs = require('fs');
const path = require('path');
const inquirer = require('inquirer');
const minimist = require('minimist');
const chalk = require('chalk');
const Configstore = require('configstore');
const packageInfo = require('../package.json');

// Per-user store, keyed by the package name, holding the Facebook credentials.
// NOTE(review): the password is saved through Configstore as given; presumably
// that means plain text on disk - confirm this is acceptable.
const config = new Configstore(
  packageInfo.name,
  {},
);

// Parsed command line. Named `cliArguments` (not `arguments`) so that it does
// not shadow the implicit `arguments` object and stays valid in strict mode.
const cliArguments = minimist(
  process.argv.slice(2),
  {
    string: ['group-ids', 'output'],
    boolean: ['version', 'help', 'debug', 'headful'],
    _: ['init'],
    default: {'output': './'},
    alias: {h: 'help', v: 'version'},
    stopEarly: true, /* populate _ with first non-option */
  },
);

// Chromium resource types that are never needed for reading post text.
const BLOCKED_RESOURCE_TYPES = new Set([
  'image',
  'media',
  'font',
  'texttrack',
  'object',
  'beacon',
  'csp_report',
  'imageset',
]);

// Requests whose URL looks like an image are dropped whatever their type.
const IMAGE_URL_PATTERN = /\.((jpe?g)|png|gif)/;

// XPath expressions used while logging in and while reading a group page.
const COOKIE_ACCEPT_BUTTON_XPATH = '//button[@data-cookiebanner="accept_button"]';
const LOGIN_BUTTON_XPATH = '//button[@data-testid="royal_login_button"]';
const STORIES_XPATH = '//div[@data-pagelet="Stories"]';
const POST_CONTAINER_XPATH = '//article/div[@class="story_body_container"]';
const POST_BODY_XPATH = '//article/div[@class="story_body_container"]/div[1]';
const POST_AUTHOR_XPATH =
  '((//article/div[@class="story_body_container"])' +
  '[child::div])/header//strong[1]';

/**
 * @callback validatorCallback
 * @param {string} input the value typed by the user
 * @return {boolean} true if the input is valid
 */

/**
 * @callback askQuestionsFunctionCallback
 * @param {validatorCallback} validator function validating each answer
 * @return {Promise<Object>} the answers given by the user
 */

/**
 * @callback helpPageLineCallback
 * @param {string} command command name to show
 * @param {string} description command description to show
 * @return {void}
 */

/**
 * @callback sleepFunctionCallback
 * @param {number} time the number of milliseconds to wait
 * @return {Promise<void>} resolves once the delay has elapsed
 */

/**
 * @callback setPageListenersCallback
 * @param {Page} page the page the listeners are installed on
 * @return {Promise<void>}
 */

/**
 * @callback getOldPublicationsCallback
 * @param {string} fileName the file the old publications are loaded from
 * @return {Object[]} the publications stored in that file
 */

/**
 * @callback autoScrollCallback
 * @param {Page} page the page to scroll
 * @param {sleepFunctionCallback} sleep sleep function (see autoScroll)
 * @return {Promise<void>}
 */

/**
 * Validates an answer typed by the user: any non-empty string is accepted.
 * @param {string} input the input to validate
 * @return {boolean} true if the given input is not empty
 */
function validator(input) {
  return input.length !== 0;
}

/**
 * Prompts the user for the Facebook username and password.
 * @param {validatorCallback} validator function validating each answer
 * @return {Promise<Object>} answers keyed by 'facebook-username' and
 * 'facebook-password'
 */
async function askConfigQuestions(validator) {
  const answers = await inquirer.prompt([
    {
      name: 'facebook-username',
      type: 'input',
      message: 'facebook username:',
      validate: validator,
    },
    {
      name: 'facebook-password',
      type: 'password',
      message: 'password:',
      validate: validator,
    },
  ]);
  return answers;
}

/**
 * Asks the user for the credentials and saves them in the configuration store.
 * @param {askQuestionsFunctionCallback} askQuestionsFunction function that
 * prompts the user and resolves with the answers
 * @param {validatorCallback} validator function validating each answer
 * @return {Promise<void>}
 */
async function userConfig(askQuestionsFunction, validator) {
  // Use the injected prompt function instead of a hard-wired one.
  const answers = await askQuestionsFunction(validator);
  config.set({
    username: answers['facebook-username'],
    password: answers['facebook-password'],
  });
}

/**
 * Prints one line of the help page.
 * @param {string} command command name to show
 * @param {string} description command description to show
 * @return {void}
 */
function helpPageLine(command, description) {
  const magenta = chalk.magenta;
  console.info(' ' + magenta(command) + ': ' + description);
}

/**
 * Prints the help page.
 * @param {helpPageLineCallback} helpPageLine function printing one help line
 * @return {void}
 */
function help(helpPageLine) {
  console.info('Available options:');
  helpPageLine(
    '--group-ids',
    ' Indicates which groups ids that we want to' +
    ' scrape (seperated by commas)',
  );
  helpPageLine('-h, --help', ' Shows the help page');
  helpPageLine('-v, --version', 'Shows the CLI version');
  helpPageLine('--output', ' Specify the output folder destination');
  helpPageLine('--headful', ' Disable headless mode');
  helpPageLine('--debug', ' Print debug information while scraping');
  console.info('Available commands:');
  helpPageLine('init', ' Initialize user configuration');
}

/**
 * Prints an error message on stderr, prefixed with a bold red "ERROR:".
 * @param {string} message message to display
 * @return {void}
 */
function error(message) {
  console.error(
    chalk.bold.red('ERROR:') + ' ' + message,
  );
}

/**
 * Prints the CLI version read from package.json.
 * @return {void}
 */
function version() {
  console.log(packageInfo.version);
}

/**
 * Tells whether both the username and the password are stored.
 * @return {boolean} true if the user is configured
 */
function isUserConfigured() {
  // `!= null` rejects both null and undefined.
  return config.get('username') != null && config.get('password') != null;
}

/**
 * Waits for the given number of milliseconds.
 * @param {number} time delay in milliseconds
 * @return {Promise<void>} resolves once the delay has elapsed
 */
function sleep(time) {
  return new Promise((resolve) => {
    setTimeout(resolve, time);
  });
}

/**
 * Scrolls the page to the bottom a random number of times (10 to 20), waiting
 * a random 1 to 5 seconds after each scroll.
 * @param {Page} page the current page opened in the browser
 * @param {sleepFunctionCallback} sleep kept for interface compatibility; the
 * callback passed to page.evaluate defines its own delay helper instead of
 * using this function
 * @return {Promise<void>}
 */
async function autoScroll(page, sleep) {
  await page.evaluate(async () => {
    const pause = (time) => new Promise((resolve) => {
      setTimeout(resolve, time);
    });
    // Draw the number of scrolls once; the loop bound must not change
    // between iterations.
    const scrollCount = Math.round((Math.random() * 10) + 10);
    for (let i = 0; i < scrollCount; i++) {
      window.scrollBy(0, document.body.scrollHeight);
      await pause(Math.round((Math.random() * 4000) + 1000));
    }
  });
}

/**
 * Builds the mobile Facebook group URL from the given group id.
 * @param {string} groupId facebook group id
 * @return {string} the Facebook group url
 */
function generateFacebookGroupUrlFromId(groupId) {
  return 'https://m.facebook.com/groups/' + groupId + '/';
}

/**
 * Launches the browser.
 * @param {Object} cliArguments command line arguments parsed with minimist
 * @return {Promise<Browser>} the launched browser
 */
async function createBrowser(cliArguments) {
  const browserOptions = {
    headless: cliArguments['headful'] === false,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
    ],
  };
  if (process.arch === 'arm' || process.arch === 'arm64') {
    // On arm and arm64 the chromium-browser executable is used.
    browserOptions.executablePath = 'chromium-browser';
  }
  return puppeteer.launch(browserOptions);
}

/**
 * Opens a new page in an incognito context, to avoid the notification and
 * location permission prompts of Facebook.
 * @param {Browser} browser the browser the page is created in
 * @return {Promise<Page>} the incognito page
 */
async function incognitoMode(browser) {
  const incognitoContext = await browser.createIncognitoBrowserContext();
  return incognitoContext.newPage();
}

/**
 * Installs a request interceptor that aborts requests for content the scraper
 * does not need (images, media, fonts, ...).
 * @param {Page} page the current page of the browser
 * @return {Promise<void>}
 */
async function setPageListeners(page) {
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    const isBlocked =
      BLOCKED_RESOURCE_TYPES.has(request.resourceType()) ||
      IMAGE_URL_PATTERN.test(request.url());
    if (isBlocked) {
      request.abort();
    } else {
      request.continue();
    }
  });
}

/**
 * Logs in to Facebook with the stored credentials.
 * @param {Object} cliArguments command line arguments parsed with minimist
 * (not read by this function)
 * @param {Page} page the incognito page used for the login
 * @param {setPageListenersCallback} setPageListeners function installing the
 * request filter once the login is done
 * @return {Promise<Page>} the page once the user is logged in
 */
async function facebookLogIn(cliArguments, page, setPageListeners) {
  await page.goto('https://facebook.com');

  // Best effort: accept the cookie banner when it shows up.
  try {
    await page.waitForXPath(COOKIE_ACCEPT_BUTTON_XPATH);
    const [acceptCookiesButton] = await page.$x(COOKIE_ACCEPT_BUTTON_XPATH);
    await page.evaluate((el) => {
      el.focus();
      el.click();
    }, acceptCookiesButton);
  } catch (cookieBannerError) {
    console.info('Cookie banner did not appear');
  }

  // Wait for the login form so that its inputs can be found.
  await page.waitForSelector(selectors.login_form.parent);

  await page.focus(selectors.login_form.email);
  await page.keyboard.type(config.get('username'));

  await page.focus(selectors.login_form.password);
  await page.keyboard.type(config.get('password'));

  await page.waitForXPath(LOGIN_BUTTON_XPATH);
  const [loginButton] = await page.$x(LOGIN_BUTTON_XPATH);
  await page.evaluate((el) => {
    el.click();
  }, loginButton);

  // The stories block is awaited as the sign that the login went through.
  await page.waitForXPath(STORIES_XPATH);
  await setPageListeners(page);
  return page;
}

/**
 * Loads the publications saved by a previous run.
 * @param {string} fileName name of the JSON file
 * @return {Object[]} the saved publications, or an empty list when the file
 * does not exist
 * @throws {Error} when the file cannot be parsed or is not a JSON array
 */
function getOldPublications(fileName) {
  if (fs.existsSync(fileName) === false) {
    return [];
  }
  let parsedContent;
  try {
    parsedContent = JSON.parse(fs.readFileSync(fileName, {encoding: 'utf8'}));
  } catch (readError) {
    throw new Error(
      fileName + ' could not be read as JSON: ' + readError.message,
    );
  }
  if (Array.isArray(parsedContent) === false) {
    throw new Error(fileName + ' does not contain a JSON array');
  }
  return parsedContent;
}

/**
 * Builds the lookup key identifying a publication.
 * @param {string} post text of the post
 * @param {string} author name of the author
 * @return {string} key unique to this (post, author) pair
 */
function publicationKey(post, author) {
  return JSON.stringify([post, author]);
}

/**
 * Scrapes one Facebook group: reads the posts shown on the page, scrolls, and
 * repeats until a pass finds no new post; then writes all posts to
 * "<output>/<group title>.json".
 * @param {Object} cliArguments command line arguments parsed with minimist
 * @param {string} groupUrl the url of the Facebook group
 * @param {Page} page the logged-in page
 * @param {string} id the id of the facebook group (not read by this function)
 * @param {getOldPublicationsCallback} getOldPublications function loading the
 * publications of previous runs
 * @param {autoScrollCallback} autoScroll function scrolling the page
 * @param {sleepFunctionCallback} sleep sleep function handed to autoScroll
 * @return {Promise<void>}
 */
async function facebookMain(
  cliArguments,
  groupUrl,
  page,
  id,
  getOldPublications,
  autoScroll,
  sleep,
) {
  const isDebug = cliArguments['debug'] === true;

  await page.goto(
    groupUrl,
    {timeout: 600000},
  );

  // The page title is used as the group name and as the output file name.
  const [groupNameHtmlElement] = await page.$x('/html/head/title');
  let groupName = await page.evaluate(
    (el) => el.textContent,
    groupNameHtmlElement,
  );
  if (isDebug) {
    console.log('Group title ' + groupName);
  }
  groupName = groupName.replace(/\//g, '_');

  // path.join copes with an output folder given without a trailing separator.
  const fileName = path.join(cliArguments['output'], groupName + '.json');

  const allPublicationsList = getOldPublications(fileName);

  // Keys of every known publication, for constant-time duplicate checks.
  const knownPublicationKeys = new Set();
  for (const oldPublication of allPublicationsList) {
    knownPublicationKeys.add(
      publicationKey(oldPublication.post, oldPublication.author),
    );
    // Earlier versions saved the two fields swapped; index old entries both
    // ways so that posts scraped back then are not added a second time.
    knownPublicationKeys.add(
      publicationKey(oldPublication.author, oldPublication.post),
    );
  }

  let isAnyNewPosts;
  do {
    if (isDebug) {
      console.log(`Total posts before scraping ${allPublicationsList.length}`);
    }
    isAnyNewPosts = false;

    await page.waitForXPath(POST_CONTAINER_XPATH);
    const groupPostsHtmlElements = await page.$x(POST_BODY_XPATH);
    const groupPostsAuthorHtmlElements = await page.$x(POST_AUTHOR_XPATH);

    // The two lists are read in parallel; stop at the shorter one so that an
    // unmatched element never reaches page.evaluate as undefined.
    const postCount = Math.min(
      groupPostsHtmlElements.length,
      groupPostsAuthorHtmlElements.length,
    );
    for (let i = 0; i < postCount; i++) {
      const [postAuthorName, postTextContent] = await page.evaluate(
        (authorEl, bodyEl) => [authorEl.textContent, bodyEl.textContent],
        groupPostsAuthorHtmlElements[i],
        groupPostsHtmlElements[i],
      );

      const publication = {
        post: postTextContent,
        author: postAuthorName,
      };
      const key = publicationKey(publication.post, publication.author);
      if (knownPublicationKeys.has(key) === false) {
        knownPublicationKeys.add(key);
        allPublicationsList.push(publication);
        isAnyNewPosts = true;
      }
    }

    if (isDebug) {
      console.log(
        'Total posts before scrolling ' + allPublicationsList.length,
      );
    }
    await autoScroll(page, sleep);
  } while (isAnyNewPosts === true);

  console.info(
    groupName +
    ' Facebook group\'s posts scraped: ' +
    allPublicationsList.length +
    ' posts found',
  );
  fs.writeFileSync(
    fileName,
    JSON.stringify(allPublicationsList, undefined, 4),
    {encoding: 'utf8'},
  );
}

/**
 * Runs the scraper: configures the user if needed, logs in, and scrapes every
 * group given with --group-ids. The browser is closed even when a step fails.
 * @param {Object} cliArguments command line arguments parsed with minimist
 * @param {askQuestionsFunctionCallback} askQuestionsFunction function asking
 * the configuration questions
 * @param {validatorCallback} validator function validating the user answers
 * @param {Function} createBrowser function creating the browser
 * @param {Function} incognitoMode function creating an incognito page from
 * the given browser
 * @param {setPageListenersCallback} setPageListeners function installing the
 * request filter on the page
 * @param {Function} generateFacebookGroupUrlFromId function building a group
 * url from a group id
 * @param {Function} facebookMain function scraping one group
 * @param {getOldPublicationsCallback} getOldPublications function loading the
 * old publications
 * @param {autoScrollCallback} autoScroll function scrolling the page
 * @param {sleepFunctionCallback} sleep function waiting for a delay
 * @return {Promise<void>}
 * @throws {Error} when --group-ids contains no usable id
 */
async function main(
  cliArguments,
  askQuestionsFunction,
  validator,
  createBrowser,
  incognitoMode,
  setPageListeners,
  generateFacebookGroupUrlFromId,
  facebookMain,
  getOldPublications,
  autoScroll,
  sleep,
) {
  // Tolerate spaces around the commas and drop empty entries ("1, 2,").
  const facebookGroupIdList = cliArguments['group-ids']
    .split(',')
    .map((groupId) => groupId.trim())
    .filter((groupId) => groupId.length !== 0);
  if (facebookGroupIdList.length === 0) {
    throw new Error('No group id found in --group-ids');
  }

  if (isUserConfigured() === false) {
    await userConfig(askQuestionsFunction, validator);
  }

  const browser = await createBrowser(cliArguments);
  try {
    let page = await incognitoMode(browser);
    // NOTE(review): the string starts with the literal text "User agent ",
    // which looks like a paste error; kept unchanged because the markup
    // Facebook serves may depend on it - confirm before changing.
    await page.setUserAgent(
      'User agent Mozilla/5.0 (Macintosh;' +
      ' Intel Mac OS X 10_16_0) AppleWebKit/537.36'+
      ' (KHTML, like Gecko) Chrome/80.0.3987.0 Safari/537.36',
    );
    page = await facebookLogIn(cliArguments, page, setPageListeners);
    for (const id of facebookGroupIdList) {
      const groupUrl = generateFacebookGroupUrlFromId(id);
      await facebookMain(
        cliArguments,
        groupUrl,
        page,
        id,
        getOldPublications,
        autoScroll,
        sleep,
      );
    }
  } finally {
    await browser.close();
  }
}

/**
 * Reports a failed run on stderr and exits with a non-zero status.
 * @param {*} failure the rejection reason
 * @return {void}
 */
function exitWithFailure(failure) {
  const isErrorObject = failure instanceof Error;
  error(isErrorObject ? failure.message : String(failure));
  if (cliArguments['debug'] === true && isErrorObject) {
    console.error(failure.stack);
  }
  process.exit(1);
}

// --help and --version are answered first: they need no output folder.
if (cliArguments['help'] === true) {
  help(helpPageLine);
  process.exit(0);
}

if (cliArguments['version'] === true) {
  version();
  process.exit(0);
}

if (
  fs.existsSync(cliArguments['output']) === false ||
  fs.lstatSync(cliArguments['output']).isDirectory() === false
) {
  // The output path is missing or is not a directory.
  error(
    cliArguments['output'] +
    ' does not exists or is not a directory. '+
    'Please retry with an existing directory path',
  );
  process.exit(1);
}

if (cliArguments['_'].indexOf('init') !== -1) {
  userConfig(askConfigQuestions, validator)
    .then(() => {
      process.exit(0);
    })
    .catch(exitWithFailure);
} else if (cliArguments['group-ids'] != null) {
  main(
    cliArguments,
    askConfigQuestions,
    validator,
    createBrowser,
    incognitoMode,
    setPageListeners,
    generateFacebookGroupUrlFromId,
    facebookMain,
    getOldPublications,
    autoScroll,
    sleep,
  )
    .then(() => {
      console.log('Facebook group scraping done');
    })
    .catch(exitWithFailure);
} else {
  error('No argument specified. Please check help page for valid arguments');
  help(helpPageLine);
  process.exit(1);
}