2018-04-09 17:41:19 +03:00
|
|
|
const puppeteer = require('puppeteer');
|
|
|
|
const mysql = require('mysql2/promise');
|
|
|
|
const config = require("./config.json");
|
|
|
|
|
|
|
|
/**
 * Sequential async counterpart of `Array.prototype.forEach`.
 *
 * Each invocation of `callback` is awaited before the next element is
 * visited, so the callbacks never overlap.
 *
 * @param {Array} array - Items to visit, in index order.
 * @param {Function} callback - Called as `callback(item, index, args)`; may return a promise.
 * @param {Array} [args=[]] - Extra value handed unchanged to every callback call.
 * @returns {Promise<void>} Resolves once every callback has settled; rejects with the first callback error.
 */
async function asyncForEach (array, callback, args = []) {
    // `array.length` is re-read on every pass, so items appended by a callback are visited too.
    for (let index = 0; index < array.length; index++) {
        await callback(array[index], index, args);
    }
}
|
|
|
|
|
|
|
|
/**
 * Scraper that copies the urq.borda.ru forum into MySQL tables named
 * `${DB_PREFIX}users`, `${DB_PREFIX}topics` and `${DB_PREFIX}posts`
 * (the column names look like phpBB's schema - confirm against the target DB).
 *
 * Typical use: `await new Urq().scrape()`.
 */
class Urq {
    /**
     * Prepares the connection options from `config.json` and the target forum id.
     */
    constructor() {
        this.dbopts = {
            host: 'localhost',
            database: config.DB_NAME,
            user: config.DB_USER,
            password: config.DB_PASSWORD,
            // NOTE(review): `protocol` and `query` do not look like mysql2
            // connection options - confirm whether they can be dropped.
            protocol: 'mysql',
            port: '3306',
            query: {pool: true}
        };
        this.forum_id = 2;// your first category
    }

    /**
     * Extracts the post title and timestamp from the HTML of a post's meta cell.
     *
     * @param {?string} meta - `innerHTML` of the meta cell, or null/undefined when the cell is missing.
     * @returns {{title: string, date: ?number}} `title` is the text after the
     *   "Заголовок: " marker ('' when absent); `date` is a Unix timestamp in
     *   seconds, interpreted in the local time zone, or null when no
     *   `dd.mm.yy hh:mm` stamp is found.
     */
    static parse_post_meta(meta) {
        const parsed = { title: '', date: null };
        if (typeof meta !== 'string') {
            return parsed;
        }

        const titleParts = meta.split("Заголовок: ");
        if (titleParts.length > 1) {
            parsed.title = titleParts[1];
        }

        // The separator between date and time is tolerated in several forms:
        // a literal `&nbsp;` entity (what innerHTML yields for a non-breaking
        // space), plain whitespace, or the " ;" sequence the previous pattern
        // matched. NOTE(review): confirm which one the live site emits.
        const stamp = meta.match(/(\d\d)\.(\d\d)\.(\d\d)(?:&nbsp;|[\s;])+(\d\d):(\d\d)/);
        if (stamp) {
            const [, day, month, year, hour, minute] = stamp;
            const when = new Date(
                Number(`20${year}`),
                Number(month) - 1, // Date months are 0-based
                Number(day),
                Number(hour),
                Number(minute)
            );
            parsed.date = when.getTime() / 1000;
        }
        return parsed;
    }

    /**
     * Looks up a user by name, creating the user row when it does not exist.
     *
     * Requires `this.db` to be an open connection (set by `scrape()`).
     *
     * @param {string} username - Forum user name.
     * @returns {Promise<number>} The existing or newly inserted `user_id`.
     * @throws {TypeError} When `username` is not a string.
     */
    async get_user_id(username) {
        if (typeof username !== 'string') {
            throw new TypeError(`get_user_id: username must be a string, got ${username}`);
        }

        // Bound parameter instead of string interpolation: user names come
        // from scraped HTML and may contain quotes.
        const [userrows] = await this.db.execute(
            `SELECT \`user_id\` FROM \`${config.DB_PREFIX}users\` WHERE \`username\` = ?;`,
            [username]
        );
        if (userrows.length > 0) {
            return userrows[0].user_id;
        }

        // execute() resolves to [result, fields]; the insert id lives on the result header.
        const [inserted] = await this.db.execute(
            `INSERT INTO \`${config.DB_PREFIX}users\`
            (
                \`username\`, \`username_clean\`, \`user_permissions\`,
                \`user_sig\`
            )
            VALUES(?, ?, ?, ?);`,
            [
                username,
                username.toLowerCase(),
                "",
                ""
            ]
        );
        return inserted.insertId;
    }

    /**
     * Entry point: opens the DB connection and a headless browser, makes sure
     * the thread index table exists, imports all indexed threads, then
     * releases both resources even if a step fails.
     *
     * @returns {Promise<void>}
     */
    async scrape() {
        this.db = await mysql.createConnection(this.dbopts);
        try {
            this.browser = await puppeteer.launch({
                "headless": true,
                "args": [
                    "--disable-web-security",
                    "--no-sandbox",
                    "--disable-dev-shm-usage"
                ]
            });
            try {
                await this.db.execute(`
                    CREATE TABLE IF NOT EXISTS \`${config.DB_PREFIX}bordathreads\` (
                        \`id\` Int( 255 ) UNSIGNED AUTO_INCREMENT NOT NULL,
                        \`url\` Varchar(255) UNIQUE NOT NULL,
                        \`name\` Varchar(255) NOT NULL,
                        PRIMARY KEY ( \`id\` )
                    );`);
                // Building the thread index is currently switched off;
                // re-enable the next line to refill `bordathreads`.
                // await this.scrape_pages();
                await this.scrape_threads();
            } finally {
                await this.browser.close();
            }
        } finally {
            await this.db.end();
        }
    }

    /**
     * Walks the forum's thread listing (offsets 0..400 in steps of 20) and
     * stores every thread link in the `bordathreads` table.
     *
     * Errors are logged, never thrown. A failed insert (for example a
     * duplicate `url`) does not stop the remaining inserts.
     *
     * @returns {Promise<void>}
     */
    async scrape_pages() {
        let page;
        try {
            console.log("Scraping threads.");
            page = await this.browser.newPage();
            const threads = [];
            for (let offset = 0; offset <= 400; offset += 20) {
                console.log("Offset "+offset);
                await page.goto("http://urq.borda.ru?0-0-"+offset, {
                    "waitUntil" : "domcontentloaded"
                });

                // Runs inside the page: must not reference anything from this scope.
                const pagethreads = await page.evaluate(() => {
                    try {
                        const links = document.querySelectorAll(".font3 a");
                        const href = [];
                        for (let i = 0; i < links.length; i++) {
                            let url = "http://urq.borda.ru"+links[i].getAttribute("href");
                            url = url.replace("-000-0-0-", "-000-10001-0-");
                            href.push({
                                "url": url,
                                "name": links[i].innerHTML,
                            });
                        }
                        return href;
                    } catch (e) {
                        return [];
                    }
                });
                console.log(pagethreads);
                threads.push(...pagethreads);
            }

            console.log(threads);

            for (const thread of threads) {
                try {
                    await this.db.execute(`
                        INSERT INTO ${config.DB_PREFIX}bordathreads (url, name) VALUES( ?, ? )
                    `, [
                        thread.url,
                        thread.name,
                    ]);
                } catch(e) {
                    console.log(e);
                }
            }
        } catch(e) {
            console.log(e);
        } finally {
            // Close the tab on the error path too; keep the "never throws" contract.
            if (page) {
                try {
                    await page.close();
                } catch(e) {
                    console.log(e);
                }
            }
        }
    }

    /**
     * Scrape all threads
     *
     * For every row of `bordathreads`: opens the thread URL, collects its
     * messages, inserts one topic (author and time taken from the first
     * message) and one post per message.
     *
     * Errors are logged, never thrown; the first error stops the import.
     *
     * @returns {Promise<void>}
     */
    async scrape_threads() {
        let page;
        try {
            console.log("Scraping threads.");
            page = await this.browser.newPage();
            // Forward the page's console output to the Node console.
            page.on('console', msg => {
                const consoleArgs = msg.args();
                for (let i = 0; i < consoleArgs.length; ++i) {
                    console.log(`${i}: ${consoleArgs[i]}`);
                }
            });

            const [rows] = await this.db.execute(`
                SELECT * FROM \`${config.DB_PREFIX}bordathreads\`
            `);

            for (const thread of rows) {
                await page.goto(thread.url, {
                    "waitUntil" : "domcontentloaded"
                });

                // Runs inside the page: must not reference anything from this
                // scope. Returns raw strings only; parsing happens in Node.
                const rawMessages = await page.evaluate(() => {
                    const msgclasses = ["mess1", "mess2", "mess3", "mess4"];
                    try {
                        const tableRows = document.querySelectorAll("body > table > tbody > tr > td > table > tbody > tr > td > table > tbody > tr");
                        const retval = [];
                        for (let i = 0; i < tableRows.length; i++) {
                            const row = tableRows[i];
                            const classAttr = row.getAttribute("class");
                            const rowclasses = classAttr !== null ? classAttr.split(" ") : [];
                            // Skip rows that carry classes but none of the message classes.
                            if (
                                rowclasses.length > 0 &&
                                !rowclasses.some((name) => msgclasses.indexOf(name) > -1)
                            ) {
                                continue;
                            }

                            const authorNode = row.querySelector(".font3 a b");
                            const author = authorNode ? authorNode.innerHTML : null;
                            const contentNode = row.querySelector(".font1");
                            const content = contentNode ? contentNode.innerHTML : null;
                            if (!author && !content) {
                                continue;
                            }

                            const metaNode = row.querySelectorAll(".font2")[2];
                            const meta = metaNode ? metaNode.innerHTML : null;

                            const avatarNode = row.querySelectorAll(".font2 img")[1];
                            let avatar = avatarNode ? avatarNode.getAttribute("src") : null;
                            // This image is treated as "no avatar".
                            if (avatar === "http://forum24.ru/gif/img/p.gif") {
                                avatar = null;
                            }

                            retval.push({
                                "author": author,
                                "content": content,
                                "meta": meta,
                                "avatar": avatar
                            });
                        }
                        return retval;
                    } catch (e) {
                        console.log(e.message);
                        return [];
                    }
                });

                const messages = rawMessages.map((raw) => {
                    const parsedMeta = Urq.parse_post_meta(raw.meta);
                    return {
                        "author": raw.author,
                        "title": parsedMeta.title,
                        "content": raw.content,
                        "avatar": raw.avatar,
                        "date": parsedMeta.date
                    };
                });
                //console.log(messages);

                if (messages.length === 0) {
                    console.log(`No messages found in ${thread.url}, skipping.`);
                    continue;
                }

                // The topic is attributed to the author of the first message.
                const topicPoster = await this.get_user_id(messages[0].author);
                const [topicResult] = await this.db.execute(`
                    INSERT INTO \`${config.DB_PREFIX}topics\`
                    (\`forum_id\`, \`topic_poster\`, \`topic_time\`, \`topic_title\`)
                    VALUES(?, ?, ?, ?);`,[
                    this.forum_id,
                    topicPoster,
                    messages[0].date,
                    thread.name
                ]);
                const topic_id = topicResult.insertId;

                for (const message of messages) {
                    const user_id = await this.get_user_id(message.author);
                    await this.db.execute(`
                        INSERT INTO \`${config.DB_PREFIX}posts\`
                        (
                            \`topic_id\`, \`forum_id\`, \`poster_id\`,
                            \`post_time\`, \`post_subject\`, \`post_text\`
                        )
                        VALUES(?, ?, ?, ?, ?, ?);`,[
                        topic_id,
                        this.forum_id,
                        user_id,
                        message.date,
                        message.title,
                        message.content
                    ]);
                }
            }
        } catch(e) {
            console.log(e);
        } finally {
            // Close the tab on the error path too; keep the "never throws" contract.
            if (page) {
                try {
                    await page.close();
                } catch(e) {
                    console.log(e);
                }
            }
        }
    }
}
|
|
|
|
// CommonJS export: the module's value is the Urq class itself.
module.exports = Urq;
|