urqforum-parser/urq.js

333 lines
9.7 KiB
JavaScript

import puppeteer from 'puppeteer';
import fs from 'fs';
import mysql from 'mysql2/promise';
// Settings (DB_NAME, DB_USER, DB_PASSWORD, DB_PREFIX) are read once at module
// load from config.json, resolved against the current working directory.
const rawConfig = fs.readFileSync('./config.json');
const config = JSON.parse(rawConfig);
/**
 * Scraper that copies the urq.borda.ru forum into phpBB database tables.
 *
 * `scrape()` is the entry point: it stores thread links from the board index
 * in a `bordathreads` staging table, then visits every stored thread and
 * writes its messages into the `users`, `topics` and `posts` tables (all
 * table names are prefixed with `config.DB_PREFIX`).
 */
export default class Urq {
  /**
   * Build the MySQL connection options from the module-level `config`.
   * No connection is opened here; `scrape()` does that.
   */
  constructor() {
    this.dbopts = {
      host: 'localhost',
      database: config.DB_NAME,
      user: config.DB_USER,
      password: config.DB_PASSWORD,
      port: '3306',
    };
    this.forum_id = 2;// your first category
  }

  /**
   * Look up a user by name (case-insensitively, via `username_clean`) and
   * create the user when it does not exist yet.
   *
   * @param {string} username - Author name as scraped from the forum.
   * @returns {Promise<number|false>} The user id, or `false` when `username`
   *   is empty/missing.
   */
  async get_user_id(username) {
    if (!username) {
      return false;
    }
    const username_clean = username.toLowerCase();
    const [userrows] = await this.db.execute(`
      SELECT \`user_id\` FROM \`${config.DB_PREFIX}users\`
      WHERE username_clean = ?;
    `, [
      username_clean,
    ]);
    if (userrows.length > 0) {
      return userrows[0].user_id;
    }
    // execute() resolves to [result, fields]; the generated id lives on the
    // first element, not on the array itself.
    const [result] = await this.db.execute(`
      INSERT INTO \`${config.DB_PREFIX}users\`
      (
      \`username\`, \`username_clean\`, \`user_permissions\`,
      \`user_sig\`
      )
      VALUES(?, ?, ?, ?);`, [
      username,
      username_clean,
      "",
      "",
    ]);
    return result.insertId;
  }

  /**
   * Run the whole import: connect to MySQL, launch a headless browser, make
   * sure the staging table exists, verify that the phpBB `posts` table is
   * present, then scrape the thread index and every thread.
   *
   * The browser and the database connection are always released, including
   * on failure, so the Node process can exit.
   *
   * @returns {Promise<void>}
   */
  async scrape() {
    this.db = await mysql.createConnection(this.dbopts);
    try {
      this.browser = await puppeteer.launch({
        "headless": true,
      });
      try {
        await this.db.execute(`
          CREATE TABLE IF NOT EXISTS \`${config.DB_PREFIX}bordathreads\` (
          \`id\` Int( 255 ) UNSIGNED AUTO_INCREMENT NOT NULL,
          \`url\` Varchar(255) UNIQUE NOT NULL,
          \`name\` Varchar(255) NOT NULL,
          PRIMARY KEY ( \`id\` )
          );`);
        try {
          // Probe query: fails when the phpBB tables are missing.
          await this.db.execute(
            `SELECT COUNT(*) FROM \`${config.DB_PREFIX}posts\``
          );
        } catch (e) {
          console.error('Please install phpbb first.');
          return;
        }
        await this.scrape_pages();
        await this.scrape_threads();
      } finally {
        await this.browser.close();
      }
    } finally {
      await this.db.end();
    }
  }

  /**
   * Walk the paginated board index and store every thread link that is not
   * yet present in the `bordathreads` table.
   *
   * Errors are logged and swallowed, so a failure here does not stop the
   * rest of `scrape()`.
   *
   * @param {number} [max_offset=400] - Last index offset to request (inclusive).
   * @param {number} [step=20] - Offset increment between index pages; must be
   *   positive.
   * @returns {Promise<void>}
   */
  async scrape_pages(max_offset = 400, step = 20) {
    let page;
    try {
      console.log("Scraping threads.");
      page = await this.browser.newPage();
      const threads = [];
      for (let offset = 0; offset <= max_offset; offset += step) {
        console.log("Offset "+offset);
        await page.goto("http://urq.borda.ru?0-0-"+offset, {
          "waitUntil" : "domcontentloaded"
        });
        // This callback is executed inside the page, so it must not reference
        // anything from the Node side.
        const pagethreads = await page.evaluate(() => {
          try {
            const links = document.querySelectorAll(".font3 a");
            const found = [];
            for (let i = 0; i < links.length; i++) {
              let url = "http://urq.borda.ru"+links[i].getAttribute("href");
              // NOTE(review): presumably switches the link to a variant that
              // shows the whole thread on one page - confirm against the site.
              url = url.replace("-000-0-0-", "-000-10001-0-");
              found.push({
                "url": url,
                "name": links[i].innerHTML,
              });
            }
            return found;
          } catch (e) {
            return [];
          }
        });
        console.log(pagethreads);
        threads.push(...pagethreads);
      }
      console.log(threads);
      for (const thread of threads) {
        // One failing row must not prevent the remaining rows from being stored.
        try {
          const [threadrow] = await this.db.execute(`
            SELECT \`id\` FROM \`${config.DB_PREFIX}bordathreads\`
            WHERE \`url\` = ?
          `, [
            thread.url,
          ]);
          if (threadrow.length === 0) {
            await this.db.execute(`
              INSERT INTO ${config.DB_PREFIX}bordathreads (url, name) VALUES( ?, ? )
            `, [
              thread.url,
              thread.name,
            ]);
          }
        } catch (e) {
          console.log(e);
        }
      }
    } catch (e) {
      console.log(e);
    } finally {
      if (page) {
        try {
          await page.close();
        } catch (e) {
          console.log(e);
        }
      }
    }
  }

  /**
   * Scrape all threads stored in the `bordathreads` table.
   *
   * Each thread is imported independently: an error in one thread is logged
   * and the loop continues with the next one.
   *
   * @returns {Promise<void>}
   */
  async scrape_threads() {
    let page;
    try {
      console.log("Scraping threads.");
      page = await this.browser.newPage();
      // Forward console output of the scraped page to the Node console.
      page.on('console', msg => {
        const args = msg.args();
        for (let i = 0; i < args.length; ++i) {
          console.log(`${i}: ${args[i]}`);
        }
      });
      const [threads] = await this.db.execute(`
        SELECT * FROM \`${config.DB_PREFIX}bordathreads\`
      `);
      for (const thread of threads) {
        try {
          await this.scrape_thread(page, thread);
        } catch (e) {
          console.log(e);
        }
      }
    } catch (e) {
      console.log(e);
    } finally {
      if (page) {
        try {
          await page.close();
        } catch (e) {
          console.log(e);
        }
      }
    }
  }

  /**
   * Open one thread, extract its messages and import them.
   *
   * The author of the first message becomes the topic poster. A thread is
   * skipped when no messages were extracted or when the first message has no
   * author. Individual messages lacking author, date, title or content are
   * logged and skipped.
   *
   * @param {object} page - Puppeteer page used for navigation.
   * @param {{url: string, name: string}} thread - Row from `bordathreads`.
   * @returns {Promise<void>}
   */
  async scrape_thread(page, thread) {
    await page.goto(thread.url, {
      "waitUntil" : "domcontentloaded"
    });
    // This callback is executed inside the page, so it must not reference
    // anything from the Node side.
    const messages = await page.evaluate(() => {
      const msgclasses = ["mess1", "mess2", "mess3", "mess4"];
      try {
        const rows = document.querySelectorAll("body > table > tbody > tr > td > table > tbody > tr > td > table > tbody > tr");
        const retval = [];
        for (let i = 0; i < rows.length; i++) {
          const row = rows[i];
          const rowclasses = Array.from(row.classList);
          // Rows that carry classes, none of which is a message class, are not
          // messages. (Comparing an array with `=== []` is always false, so
          // the emptiness of the intersection has to be tested explicitly.)
          if (
            rowclasses.length > 0 &&
            !rowclasses.some((name) => msgclasses.includes(name))
          ) {
            continue;
          }
          let date = null;
          let author = row.querySelector(".font3 a b");
          if (author) {
            author = author.innerHTML;
          }
          let content = row.querySelector(".font1");
          if (content) {
            content = content.innerHTML;
          }
          if (!author && !content) {
            continue;
          }
          let meta = row.querySelectorAll(".font2")[2];
          let title = '';
          if (meta) {
            meta = meta.innerHTML;
            const stamp = meta.match(/\d\d\.\d\d\.\d\d\&nbsp\;\d\d\:\d\d/);
            // title stays undefined when the label is absent; such messages
            // are skipped by the importer.
            [, title] = meta.split("Заголовок: ");
            if (stamp) {
              const [days, time] = stamp[0].split("&nbsp;");
              const [hour, minute] = time.split(":");
              const [day, month, year] = days.split(".");
              // Date() takes a zero-based month index, the page shows 01-12.
              const parsed = new Date(
                Number("20"+year),
                Number(month) - 1,
                Number(day),
                Number(hour),
                Number(minute)
              );
              // Unix timestamp in seconds.
              date = parsed.getTime() / 1000;
            }
          }
          let avatar = row.querySelectorAll(".font2 img")[1];
          if (avatar) {
            avatar = avatar.getAttribute("src");
            // NOTE(review): this URL looks like a placeholder image - confirm.
            if (avatar === "http://forum24.ru/gif/img/p.gif") {
              avatar = null;
            }
          }
          retval.push({
            "author": author,
            "title": title,
            "content": content,
            "avatar": avatar,
            "date": date
          });
        }
        return retval;
      } catch (e) {
        console.log(e.message);
        return [];
      }
    });
    if (messages.length === 0) {
      return;
    }
    const first = messages[0];
    const starter_id = await this.get_user_id(first.author);
    if (starter_id === false) {
      return;
    }
    const topic_id = await this.get_topic_id(starter_id, first.date, thread.name);
    for (const message of messages) {
      // `== undefined` intentionally matches both null and undefined.
      if (message.author == undefined ||
        message.date == undefined ||
        message.title == undefined ||
        message.content == undefined
      ) {
        console.log("Skipped message");
        console.log(message);
        continue;
      }
      const user_id = await this.get_user_id(message.author);
      if (user_id === false) {
        // Empty author name: there is no user to attribute the post to.
        console.log("Skipped message");
        console.log(message);
        continue;
      }
      await this.insert_post(
        topic_id,
        user_id,
        message.date,
        message.title,
        message.content
      );
    }
  }

  /**
   * Insert a post unless one with the same topic, poster, time and subject
   * already exists.
   *
   * @param {number} topic_id - Topic the post belongs to.
   * @param {number} user_id - Poster id.
   * @param {number} date - Post time as a Unix timestamp in seconds.
   * @param {string} title - Post subject.
   * @param {string} content - Post text (HTML as scraped).
   * @returns {Promise<void>}
   */
  async insert_post(topic_id, user_id, date, title, content) {
    const [existing] = await this.db.execute(`
      SELECT \`post_id\` FROM \`${config.DB_PREFIX}posts\`
      WHERE \`topic_id\` = ?
      AND \`poster_id\` = ?
      AND \`post_time\` = ?
      AND \`post_subject\` = ?;
    `, [
      topic_id,
      user_id,
      date,
      title,
    ]);
    if (existing.length > 0) {
      return;
    }
    await this.db.execute(`
      INSERT INTO \`${config.DB_PREFIX}posts\`
      (
      \`topic_id\`, \`forum_id\`, \`poster_id\`,
      \`post_time\`, \`post_subject\`, \`post_text\`,
      \`post_visibility\`
      )
      VALUES(?, ?, ?, ?, ?, ?, ?);`, [
      topic_id,
      this.forum_id,
      user_id,
      date,
      title,
      content,
      "1",
    ]);
  }

  /**
   * Look up a topic by title and poster and create it when it does not
   * exist yet.
   *
   * @param {number|string} user_id - Id of the topic poster.
   * @param {number} date - Topic time as a Unix timestamp in seconds.
   * @param {string} topic_name - Topic title.
   * @returns {Promise<number>} The topic id.
   */
  async get_topic_id(user_id, date, topic_name) {
    const poster_id = Number.parseInt(user_id, 10);
    const [topicrows] = await this.db.execute(`
      SELECT \`topic_id\` FROM \`${config.DB_PREFIX}topics\`
      WHERE \`topic_title\` = ?
      AND \`topic_poster\` = ?;
    `, [
      topic_name, poster_id,
    ]);
    if (topicrows.length > 0) {
      return topicrows[0].topic_id;
    }
    // execute() resolves to [result, fields]; the generated id lives on the
    // first element, not on the array itself.
    const [result] = await this.db.execute(`
      INSERT INTO \`${config.DB_PREFIX}topics\`
      (\`forum_id\`, \`topic_poster\`, \`topic_time\`, \`topic_title\`,
      \`topic_visibility\`)
      VALUES(?, ?, ?, ?, ?);`, [
      this.forum_id,
      poster_id,
      date,
      topic_name,
      "1",
    ]);
    return result.insertId;
  }
}