本文共 3894 字,大约阅读时间需要 12 分钟。
使用 puppeteer 爬取古诗文网的内容及音频，并通过 mysqljs 保存到数据库。
核心代码如下:
'use strict';

const puppeteer = require('puppeteer');
const request = require('request');
const fs = require('fs');
const gushiwen = require('./sql');

/**
 * Entry point: walks the numbered listing pages of gushiwen.org one at a
 * time and hands each page to `click` for scraping.
 */
(async () => {
  const browser = await puppeteer.launch({
    // headless: false
  });
  try {
    for (let i = 1; i < 999; i++) {
      const url = 'https://www.gushiwen.org/shiwen/default_0AA' + i + '.aspx';
      // A local variable instead of `this.page`: at module top level `this`
      // is `module.exports`, so the original leaked the page onto the exports.
      const page = await browser.newPage();
      try {
        // Awaiting keeps the crawl sequential, which throttles the request
        // rate and avoids hammering the remote server.
        await click(page, url);
      } finally {
        // Close every tab once it is scraped; otherwise ~1000 tabs pile up.
        await page.close();
      }
    }
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

/**
 * Scrapes one listing page: triggers every audio player, extracts each poem
 * entry, downloads its audio (if any) and stores the record via
 * `gushiwen.insertGushiwen`.
 *
 * @param {object} page - Puppeteer page used to load the listing.
 * @param {string} url - Address of the listing page to open.
 * @returns {Promise<void>} Resolves once every entry on the page was handled.
 */
async function click(page, url) {
  await page.goto(url);

  // Clicking the speaker icons is what makes the <audio> elements read below
  // available — presumably the site injects them on click; verify on the site.
  const speakerIcons = await page.$$('img[id*="speakerimg"]');
  for (const icon of speakerIcons) {
    await icon.click();
  }

  const entries = await page.$$('.left .sons');
  for (const entry of entries) {
    // Title
    const title = await entry.$eval('.sons .cont a', (el) => el.innerText);
    console.debug(title);

    // Dynasty and author share one text node, separated by a colon.
    // Accept both the ASCII and the full-width colon (the site text is
    // Chinese, so the full-width form is the likely one — TODO confirm).
    const source = await entry.$eval('.source', (el) => el.innerText);
    const sourceParts = source.split(/[:：]/);
    const dynasty = sourceParts[0];
    const author = sourceParts[1];

    // Content id: the element id minus its first 7 characters.
    const id = (await entry.$eval('.contson', (el) => el.id)).substring(7);

    // Content (kept as HTML)
    const contson = await entry.$eval('.contson', (el) => el.innerHTML);

    // Tags are optional; a missing `.tag` element is logged and stored as null.
    let tag = await entry.$eval('.tag', (el) => el.innerText).catch((err) => {
      console.error(err);
    });
    if (tag === undefined) {
      tag = null;
    } else {
      // Strip line breaks and normalise full-width commas to ASCII commas.
      tag = tag.replace(/[\r\n]/g, '').replace(/，/g, ',');
    }

    // Like count
    const scores = (await entry.$eval('.good', (el) => el.innerText)).trim();

    // Audio is optional as well; a missing <audio> element is logged.
    const audiosrc = await entry.$eval('audio', (el) => el.src).catch((err) => {
      console.error(err);
    });

    let filename = '';
    if (audiosrc !== undefined) {
      // The 6th '/'-separated segment of the audio URL is used as file name.
      filename = './mp3/' + audiosrc.split('/')[5];
      try {
        await downloadFile(audiosrc, filename, function () {
          console.debug(filename + '下载完毕');
        });
      } catch (err) {
        // A failed download must not abort the crawl; store the record
        // without an audio file instead of pointing at a missing one.
        console.error(err);
        filename = '';
      }
    }

    gushiwen.insertGushiwen(id, title, author, contson, dynasty, filename, scores, tag);
  }
}

/**
 * Downloads a remote file to disk unless the target file already exists.
 *
 * @param {string} url - Address of the remote file.
 * @param {string} filename - Local path to write to.
 * @param {Function} [callback] - Called with no arguments after the file has
 *   been written; not called when the file already existed or on failure.
 * @returns {Promise<void>} Resolves when the file is written or already
 *   exists; rejects on any other file-system or network error.
 */
function downloadFile(url, filename, callback) {
  return new Promise((resolve, reject) => {
    let settled = false;

    // 'wx' makes the open fail with EEXIST instead of overwriting, and lets
    // the stream own the only descriptor (no separate fs.open to leak).
    const stream = fs.createWriteStream(filename, { flags: 'wx' });

    const fail = (err) => {
      if (settled) {
        return;
      }
      settled = true;
      stream.destroy();
      // Best-effort removal of a partially written file.
      fs.unlink(filename, () => reject(err));
    };

    stream.on('error', (err) => {
      if (err.code === 'EEXIST') {
        console.error(filename + ' already exists');
        settled = true;
        resolve();
        return;
      }
      fail(err);
    });

    // Start the HTTP request only once the file was created successfully,
    // so nothing is fetched for files that are already on disk.
    stream.on('open', () => {
      // console.debug('downloading');
      request(url).on('error', fail).pipe(stream);
    });

    stream.on('close', () => {
      if (settled) {
        return;
      }
      settled = true;
      if (typeof callback === 'function') {
        callback();
      }
      resolve();
    });
  });
}
使用 mysqljs 数据库插入操作:
const mysql = require('mysql');

// Single shared connection to the local database used by every insert.
const connection = mysql.createConnection({
  host: 'localhost',
  user: 'root',
  password: 'root',
  database: 'nichuiniu',
});

// INSERT IGNORE: rows that collide with an existing unique key are skipped.
const INSERT_GUSHIWEN_SQL = 'INSERT ignore INTO tbl_nichuiniu_gushiwen SET ?';

/**
 * Inserts one poem record into `tbl_nichuiniu_gushiwen`.
 *
 * Fire-and-forget: nothing is returned, and a query error is thrown from
 * inside the driver callback.
 *
 * @param {*} num - Stored in column `num`.
 * @param {*} title - Stored in column `title`.
 * @param {*} author - Stored in column `author`.
 * @param {*} content - Stored in column `content`.
 * @param {*} dynasty - Stored in column `dynasty`.
 * @param {*} audiourl - Stored in column `audiourl`.
 * @param {*} scores - Stored in column `scores`.
 * @param {*} tag - Stored in column `tag`.
 */
function insertGushiwen(num, title, author, content, dynasty, audiourl, scores, tag) {
  console.log('insert Gushiwen into tables');

  const row = { num, title, author, content, dynasty, audiourl, scores, tag };

  connection.query(INSERT_GUSHIWEN_SQL, row, (err, results) => {
    if (err) {
      throw err;
    }
    console.log('The affect row is: ' + results.insertId);
  });
}

module.exports = { insertGushiwen };

// Snippet for testing this module on its own:
// connection.connect();
// var post = {num: 2, title: 'title',author:'author',content: 'content',
//   dynasty: 'dynasty',audiourl:'audiourl',scores: 1, tag: 'title'};
// connection.query('INSERT ignore INTO tbl_nichuiniu_gushiwen SET ?', post, function (error, results, fields) {
//   if (error) throw error;
//   console.log('The solution is: ' + results.insertId);
// });
// connection.end();
GitHub地址:
转载地址:http://msbws.baihongyu.com/