import puppeteer from 'puppeteer';
import fs from 'fs';
import mysql from 'mysql2/promise';

// Database name, credentials and table prefix are read once at module load.
const config = JSON.parse(fs.readFileSync('./config.json'));

/**
 * Scraper that copies thread listings and posts from http://urq.borda.ru
 * into the `users`, `topics` and `posts` tables (prefixed with
 * `config.DB_PREFIX`) of a MySQL database.
 *
 * Call `scrape()` to run the whole import. The other methods expect
 * `this.db` (and, for the scrape_* methods, `this.browser`) to have been
 * set up by `scrape()`.
 */
export default class Urq {
  constructor() {
    this.dbopts = {
      host: 'localhost',
      database: config.DB_NAME,
      user: config.DB_USER,
      password: config.DB_PASSWORD,
      port: '3306',
    };
    this.forum_id = 2; // your first category
  }

  /**
   * Look up a user by lower-cased name, creating the row if it is missing.
   *
   * @param {string} username - Author name as scraped from the page.
   * @returns {Promise<number|false>} The user id, or `false` when
   *   `username` is empty/null/undefined.
   */
  async get_user_id(username) {
    if (!username) {
      return false;
    }
    const username_clean = username.toLowerCase();

    const [userrows] = await this.db.execute(`
      SELECT \`user_id\`
      FROM \`${config.DB_PREFIX}users\`
      WHERE username_clean = ?;
    `, [username_clean]);
    if (userrows.length > 0) {
      return userrows[0].user_id;
    }

    // execute() resolves to [result, fields]; insertId lives on the result
    // header, not on the returned array.
    const [result] = await this.db.execute(`
      INSERT INTO \`${config.DB_PREFIX}users\` (
        \`username\`,
        \`username_clean\`,
        \`user_permissions\`,
        \`user_sig\`
      ) VALUES(?, ?, ?, ?);`, [username, username_clean, "", ""]);
    return result.insertId;
  }

  /**
   * Run the full import: connect to MySQL, launch a headless browser,
   * ensure the `bordathreads` bookkeeping table exists, verify that the
   * `posts` table is queryable, then scrape thread listings and threads.
   *
   * The browser and the database connection are always released, even
   * when a step throws.
   *
   * @returns {Promise<void>}
   */
  async scrape() {
    this.db = await mysql.createConnection(this.dbopts);
    try {
      this.browser = await puppeteer.launch({
        "headless": true,
      });
      try {
        await this.db.execute(`
          CREATE TABLE IF NOT EXISTS \`${config.DB_PREFIX}bordathreads\` (
            \`id\` Int( 255 ) UNSIGNED AUTO_INCREMENT NOT NULL,
            \`url\` Varchar(255) UNIQUE NOT NULL,
            \`name\` Varchar(255) NOT NULL,
            PRIMARY KEY ( \`id\` )
          );`);

        // Probe the posts table; a failure here is reported as a missing
        // phpBB installation.
        try {
          await this.db.execute(
            `SELECT COUNT(*) FROM \`${config.DB_PREFIX}posts\``
          );
        } catch (e) {
          console.error('Please install phpbb first.');
          return;
        }

        await this.scrape_pages();
        await this.scrape_threads();
      } finally {
        await this.browser.close();
      }
    } finally {
      // Without this the open connection keeps the Node process alive.
      await this.db.end();
    }
  }

  /**
   * Walk the forum index pages (offsets 0..400 in steps of 20), collect
   * every thread link and store unseen ones in the `bordathreads` table.
   * Errors are logged, not thrown.
   *
   * @returns {Promise<void>}
   */
  async scrape_pages() {
    try {
      console.log("Scraping threads.");
      const page = await this.browser.newPage();
      try {
        const threads = [];
        for (let offset = 0; offset <= 400; offset += 20) {
          console.log("Offset " + offset);
          await page.goto("http://urq.borda.ru?0-0-" + offset, {
            "waitUntil": "domcontentloaded"
          });
          // This callback is serialized and executed inside the page, so it
          // must not reference anything from the Node side.
          const pagethreads = await page.evaluate(() => {
            try {
              const links = document.querySelectorAll(".font3 a");
              const href = [];
              for (let i = 0; i < links.length; i++) {
                let url = "http://urq.borda.ru" + links[i].getAttribute("href");
                url = url.replace("-000-0-0-", "-000-10001-0-");
                href.push({
                  "url": url,
                  "name": links[i].innerHTML,
                });
              }
              return href;
            } catch (e) {
              return [];
            }
          });
          console.log(pagethreads);
          threads.push(...pagethreads);
        }
        console.log(threads);

        for (const thread of threads) {
          // One failing thread must not prevent the others from being stored.
          try {
            const [threadrow] = await this.db.execute(`
              SELECT \`id\`
              FROM \`${config.DB_PREFIX}bordathreads\`
              WHERE \`url\` = ?
            `, [thread.url]);
            if (threadrow.length === 0) {
              await this.db.execute(`
                INSERT INTO ${config.DB_PREFIX}bordathreads (url, name)
                VALUES( ?, ? )
              `, [thread.url, thread.name]);
            }
          } catch (e) {
            console.log(e);
          }
        }
      } finally {
        await page.close();
      }
    } catch (e) {
      console.log(e);
    }
  }

  /**
   * Scrape all threads
   *
   * Visits every URL stored in `bordathreads`, extracts the messages from
   * the page and inserts them as posts. The topic is created from the
   * first message (its author and date) and the stored thread name.
   * Errors are logged, not thrown; a failure in one thread does not stop
   * the remaining threads.
   *
   * @returns {Promise<void>}
   */
  async scrape_threads() {
    try {
      console.log("Scraping threads.");
      const page = await this.browser.newPage();
      try {
        // Forward console output of the page to the Node console.
        page.on('console', msg => {
          for (let i = 0; i < msg.args().length; ++i)
            console.log(`${i}: ${msg.args()[i]}`);
        });

        const [rows] = await this.db.execute(`
          SELECT * FROM \`${config.DB_PREFIX}bordathreads\`
        `);

        for (const thread of rows) {
          try {
            await page.goto(thread.url, {
              "waitUntil": "domcontentloaded"
            });

            // Serialized and executed inside the page: keep self-contained.
            const messages = await page.evaluate(() => {
              const msgclasses = ["mess1", "mess2", "mess3", "mess4"];
              try {
                const rows = document.querySelectorAll("body > table > tbody > tr > td > table > tbody > tr > td > table > tbody > tr");
                const retval = [];
                for (let i = 0; i < rows.length; i++) {
                  const row = rows[i];

                  // Skip rows that carry classes but none of the message
                  // classes. (Comparing an array with `=== []` is always
                  // false, so test membership explicitly.)
                  const classattr = row.getAttribute("class");
                  const rowclasses = classattr !== null
                    ? classattr.split(/\s+/).filter(Boolean)
                    : [];
                  if (
                    rowclasses.length > 0
                    && !rowclasses.some((c) => msgclasses.includes(c))
                  ) {
                    continue;
                  }

                  let author = row.querySelector(".font3 a b");
                  if (author) {
                    author = author.innerHTML;
                  }
                  let content = row.querySelector(".font1");
                  if (content) {
                    content = content.innerHTML;
                  }
                  if (!author && !content) {
                    continue;
                  }

                  let date = null;
                  let title = '';
                  let meta = row.querySelectorAll(".font2")[2];
                  if (meta) {
                    meta = meta.innerHTML;
                    // Everything after the marker; undefined when the marker
                    // is absent (such messages are skipped by the caller).
                    title = meta.split("Заголовок: ")[1];

                    // "dd.mm.yy<sep>hh:mm". The separator is matched
                    // loosely (whitespace incl. NBSP, ';' or the literal
                    // "&nbsp;" entity).
                    // NOTE(review): presumably innerHTML yields "&nbsp;"
                    // here — confirm against a live page.
                    const stamp = meta.match(
                      /(\d\d)\.(\d\d)\.(\d\d)(?:&nbsp;|[\s;])+(\d\d):(\d\d)/
                    );
                    if (stamp) {
                      const day = Number(stamp[1]);
                      const month = Number(stamp[2]);
                      const year = Number("20" + stamp[3]);
                      const hour = Number(stamp[4]);
                      const minute = Number(stamp[5]);
                      // Date's month argument is 0-based. The timestamp is
                      // interpreted in the browser's local time zone.
                      date = new Date(year, month - 1, day, hour, minute)
                        .getTime() / 1000;
                    }
                  }

                  let avatar = row.querySelectorAll(".font2 img")[1];
                  if (avatar) {
                    avatar = avatar.getAttribute("src");
                    if (avatar === "http://forum24.ru/gif/img/p.gif") {
                      avatar = null;
                    }
                  }

                  retval.push({
                    "author": author,
                    "title": title,
                    "content": content,
                    "avatar": avatar,
                    "date": date
                  });
                }
                return retval;
              } catch (e) {
                console.log(e.message);
                return [];
              }
            });

            if (messages.length === 0) {
              console.log("Skipped thread without messages: " + thread.url);
              continue;
            }

            // The topic is attributed to the author of the first message.
            const starter_id = await this.get_user_id(messages[0].author);
            if (starter_id === false) {
              continue;
            }
            const topic_id = await this.get_topic_id(
              starter_id,
              messages[0].date,
              thread.name
            );

            for (const message of messages) {
              if (
                message.author == undefined
                || message.date == undefined
                || message.title == undefined
                || message.content == undefined
              ) {
                console.log("Skipped message");
                console.log(message);
                continue;
              }
              const user_id = await this.get_user_id(message.author);
              if (user_id === false) {
                // Empty author name: there is no user to attribute this to.
                console.log("Skipped message");
                console.log(message);
                continue;
              }
              await this.insert_post(
                topic_id,
                user_id,
                message.date,
                message.title,
                message.content
              );
            }
          } catch (e) {
            // Log and move on so one broken thread does not abort the run.
            console.log(e);
          }
        }
      } finally {
        await page.close();
      }
    } catch (e) {
      console.log(e);
    }
  }

  /**
   * Insert a post unless a post with the same topic, poster, time and
   * subject already exists.
   *
   * @param {number} topic_id - Id of the topic the post belongs to.
   * @param {number} user_id - Id of the posting user.
   * @param {number} date - Post time in seconds since the Unix epoch.
   * @param {string} title - Post subject.
   * @param {string} content - Post body (HTML as scraped).
   * @returns {Promise<void>}
   */
  async insert_post(topic_id, user_id, date, title, content) {
    const [existing] = await this.db.execute(`
      SELECT \`post_id\`
      FROM \`${config.DB_PREFIX}posts\`
      WHERE \`topic_id\` = ?
        AND \`poster_id\` = ?
        AND \`post_time\` = ?
        AND \`post_subject\` = ?;
    `, [topic_id, user_id, date, title]);
    if (existing.length > 0) {
      return;
    }
    await this.db.execute(`
      INSERT INTO \`${config.DB_PREFIX}posts\` (
        \`topic_id\`,
        \`forum_id\`,
        \`poster_id\`,
        \`post_time\`,
        \`post_subject\`,
        \`post_text\`,
        \`post_visibility\`
      ) VALUES(?, ?, ?, ?, ?, ?, ?);`, [
      topic_id,
      this.forum_id,
      user_id,
      date,
      title,
      content,
      "1"
    ]);
  }

  /**
   * Look up a topic by title and poster, creating it if it is missing.
   *
   * @param {number|string} user_id - Id of the topic starter.
   * @param {number|null} date - Topic time in seconds since the Unix
   *   epoch; only used when the topic is created.
   * @param {string} topic_name - Topic title.
   * @returns {Promise<number>} The id of the existing or new topic.
   */
  async get_topic_id(user_id, date, topic_name) {
    const poster_id = parseInt(user_id, 10);

    const [topicrows] = await this.db.execute(`
      SELECT \`topic_id\`
      FROM \`${config.DB_PREFIX}topics\`
      WHERE \`topic_title\` = ?
        AND \`topic_poster\` = ?;
    `, [topic_name, poster_id]);
    if (topicrows.length > 0) {
      return topicrows[0].topic_id;
    }

    // execute() resolves to [result, fields]; insertId lives on the result
    // header, not on the returned array.
    const [result] = await this.db.execute(`
      INSERT INTO \`${config.DB_PREFIX}topics\`
      (\`forum_id\`, \`topic_poster\`, \`topic_time\`, \`topic_title\`, \`topic_visibility\`)
      VALUES(?, ?, ?, ?, ?);`, [
      this.forum_id,
      poster_id,
      date,
      topic_name,
      "1"
    ]);
    return result.insertId;
  }
}