drudgereport
Version:
Polls the DrudgeReport.com website and tracks the links
238 lines (218 loc) • 7.01 kB
text/typescript
import { Log } from "fme-logger";
// Module-level logger shared by everything in this file, created with the
// label "models/drudge" and its level set to "info".
var L = new Log("models/drudge");
L.setLevel("info");
import * as request from "request-promise-native";
import * as url from "url";
import * as cheerio from "cheerio";
/**
 * Shape of one link scraped from the page by the DrudgeFetch parse methods.
 * The class declares fields only; it has no constructor or behaviour.
 */
export class DrudgeArticle {
/** Inner HTML of the anchor element the link was read from. */
html: string;
/** Timestamp created when the link was parsed (not a publication date). */
time: Date;
/** Label of the page region the link came from, e.g. "leftHeadlines" or "firstColumn". */
location: string;
/** Position of the anchor among the anchors matched in its region. */
index: number;
/** Hostname obtained by parsing `href` with `url.parse`. */
source: string;
/** Value of the anchor's `href` attribute; DrudgeFetch.isDuplicate compares on it. */
href: string;
/** NOTE(review): never assigned by the parse methods in this file - confirm intended use. */
latest: Date;
}
/**
 * Polls http://drudgereport.com, extracts the links found in each region of
 * the page and remembers which links have already been handed out.
 */
export class DrudgeFetch {
  /** True once the constructor has finished initialising the instance. */
  ready: boolean;
  /** Every article returned by `findNew` so far; consulted by `isDuplicate`. */
  history: DrudgeArticle[];
  /** Starts out empty; none of the methods in this class write to it. */
  latest: DrudgeArticle[];

  constructor() {
    L.debug("DrudgeFetch.constructor Starting; ===>");
    this.history = [];
    this.latest = [];
    this.ready = true;
  }

  /**
   * Fetches the page and returns only the links whose `href` has not been
   * seen before, recording each of them in `history`.
   *
   * @returns The unseen articles, in the order `getLatest` produced them.
   * @throws Rejects with the underlying error when fetching or parsing fails
   *   (previously such a failure left the returned promise pending forever).
   */
  async findNew(): Promise<DrudgeArticle[]> {
    const articles = await this.getLatest();
    const unseen: DrudgeArticle[] = [];
    for (const article of articles) {
      if (this.isDuplicate(article)) {
        continue;
      }
      unseen.push(article);
      // Recorded immediately so a repeated href later in the same batch is
      // also treated as a duplicate.
      this.history.push(article);
    }
    return unseen;
  }

  /**
   * Reports whether an article with the same `href` is already in `history`.
   *
   * @param article Candidate article; only its `href` is inspected.
   */
  isDuplicate(article: Pick<DrudgeArticle, "href">): boolean {
    return this.history.some((seen) => seen.href === article.href);
  }

  /**
   * Downloads the page and parses every region of it.
   *
   * @returns All links in the order: left headlines, main headline, first,
   *   second and third column.
   * @throws Rejects (after logging) when the download or any parser fails.
   */
  async getLatest(): Promise<DrudgeArticle[]> {
    try {
      const body = await this.get();
      return [
        ...this.parseHeadlines(body),
        ...this.parseMainHeadline(body),
        ...this.parseFirstColumn(body),
        ...this.parseSecondColumn(body),
        ...this.parseThirdColumn(body),
      ];
    } catch (err) {
      L.error("drudgeFetch.getLatest Error", err);
      throw err;
    }
  }

  /**
   * Requests the front page.
   *
   * @returns The response body passed to the request callback.
   * @throws Rejects with the error reported by the request callback.
   */
  get(): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      request("http://drudgereport.com", (err: unknown, _res: unknown, body: string) => {
        if (err) {
          reject(err);
        } else {
          resolve(body);
        }
      });
    });
  }

  /**
   * Extracts the links under `#app_topstories`.
   *
   * @param body HTML of the page.
   * @returns Link records tagged with location "leftHeadlines".
   */
  parseHeadlines(body: string): any[] {
    const $ = cheerio.load(body);
    return this.collectLinks($, $("#app_topstories a"), "leftHeadlines");
  }

  /**
   * Extracts the links under `#app_mainheadline`.
   *
   * @param body HTML of the page.
   * @returns Link records tagged with location "mainHeadlines".
   */
  parseMainHeadline(body: string): any[] {
    const $ = cheerio.load(body);
    return this.collectLinks($, $("#app_mainheadline a"), "mainHeadlines");
  }

  /**
   * Extracts the links in the first table cell under `#app_col1`, cutting the
   * list off at the "FRONT PAGES UK" link; the link directly before that
   * marker is dropped as well.
   *
   * @param body HTML of the page.
   * @returns Link records tagged with location "firstColumn".
   */
  parseFirstColumn(body: string): any[] {
    const $ = cheerio.load(body);
    const anchors = $("#app_col1 table td").first().find("a");
    const links = this.collectLinks($, anchors, "firstColumn");
    return this.truncateAtMarker(links, "FRONT PAGES UK", true);
  }

  /**
   * Extracts the links under `#app_col2`, cutting the list off at the
   * "3 AM GIRLS" link (the marker and everything after it are dropped).
   *
   * @param body HTML of the page.
   * @returns Link records tagged with location "secondColumn".
   */
  parseSecondColumn(body: string): any[] {
    const $ = cheerio.load(body);
    const links = this.collectLinks($, $("#app_col2").find("a"), "secondColumn");
    return this.truncateAtMarker(links, "3 AM GIRLS", false);
  }

  /**
   * Extracts the links under `#app_col3`, cutting the list off at the
   * "AGENCE FRANCE-PRESSE" link; the link directly before that marker is
   * dropped as well.
   *
   * @param body HTML of the page.
   * @returns Link records tagged with location "thirdColumn".
   */
  parseThirdColumn(body: string): any[] {
    const $ = cheerio.load(body);
    const links = this.collectLinks($, $("#app_col3").find("a"), "thirdColumn");
    return this.truncateAtMarker(links, "AGENCE FRANCE-PRESSE", true);
  }

  /**
   * Turns a cheerio selection of anchors into link records.
   *
   * The cheerio handles are typed `any` because the cheerio typings in use
   * are not visible from this file.
   *
   * @param $ Loaded cheerio document the anchors belong to.
   * @param anchors Selection of `<a>` elements to convert.
   * @param location Region label stored on every record.
   * @returns One record per anchor that carries an `href` attribute.
   */
  private collectLinks($: any, anchors: any, location: string): any[] {
    const links: any[] = [];
    anchors.each((index: number, element: any) => {
      const anchor = $(element);
      const href = anchor.attr("href");
      // Anchors without an href (e.g. named anchors) are not links, and
      // url.parse throws a TypeError for non-string input, so skip them.
      if (typeof href !== "string") {
        return;
      }
      links.push({
        href,
        full_link: href,
        html: anchor.html(),
        time: new Date(),
        location,
        index, // position among all matched anchors, including skipped ones
        source: url.parse(href).hostname,
      });
    });
    return links;
  }

  /**
   * Cuts a link list off at the first link whose HTML equals `markerHtml`.
   *
   * @param links Records produced by `collectLinks`.
   * @param markerHtml Inner HTML identifying the first unwanted link.
   * @param dropPrevious When true the link just before the marker is removed too.
   * @returns The kept prefix, or `links` itself when the marker is absent.
   */
  private truncateAtMarker(links: any[], markerHtml: string, dropPrevious: boolean): any[] {
    const markerIndex = links.findIndex((link) => link.html === markerHtml);
    if (markerIndex === -1) {
      return links;
    }
    // Clamp at zero: with the marker in first position the old code called
    // splice(-1, ...), which removed only the last link instead of all of them.
    const keep = dropPrevious ? Math.max(0, markerIndex - 1) : markerIndex;
    return links.slice(0, keep);
  }
}