这里使用 Node.js 下的 puppeteer-har 库(基于 chrome-har)配合 Puppeteer 来导出浏览器的 HAR 数据,经验证效果不错,比较靠谱。
1,创建日志配置(ultra-harlog/module/log.js)
//cnpm install --save log4js
const log4js = require('log4js');
/**
 * log4js configuration for the HAR capture tool.
 *
 * Appenders: the console plus three date-rolled log files.
 * Categories: one per logger name handed out by this module, each writing to
 * the console and to its own file. Without these entries every
 * getLogger(name) call falls back to "default", which fans out to all
 * appenders, so each message would be duplicated into all three files.
 */
const options = {
  appenders: {
    console: {
      type: "console"
    },
    "puppeteer-record": {
      type: 'dateFile',
      filename: 'logs/puppeteer/log',
      pattern: '-yyyy-MM-dd.log',
      alwaysIncludePattern: true,
      encoding: 'utf-8'
    },
    "puppeteer-har-record": {
      type: 'dateFile',
      filename: 'logs/puppeteerhar/log',
      pattern: '-yyyy-MM-dd.log',
      alwaysIncludePattern: true,
      encoding: 'utf-8'
    },
    "puppeteer-harevent-record": {
      type: 'dateFile',
      filename: 'logs/puppeteerharevent/log',
      pattern: '-yyyy-MM-dd.log',
      alwaysIncludePattern: true,
      encoding: 'utf-8'
    }
  },
  categories: {
    // Fallback for any category name not listed below (kept as before).
    "default": {
      "appenders": ['console', "puppeteer-record", "puppeteer-har-record", "puppeteer-harevent-record"],
      "level": "all"
    },
    "console": { "appenders": ['console'], "level": "all" },
    "puppeteer-record": { "appenders": ['console', "puppeteer-record"], "level": "all" },
    "puppeteer-har-record": { "appenders": ['console', "puppeteer-har-record"], "level": "all" },
    "puppeteer-harevent-record": { "appenders": ['console', "puppeteer-harevent-record"], "level": "all" }
  }
};
log4js.configure(options);
/**
 * Looks up the log4js logger for the 'console' category name.
 *
 * @returns {object} whatever log4js.getLogger('console') returns.
 */
function getConsoleLogger() {
  return log4js.getLogger('console');
}
/**
 * Looks up the log4js logger for the 'puppeteer-record' category name.
 *
 * @returns {object} whatever log4js.getLogger('puppeteer-record') returns.
 */
function getPuppeteerRecordLogger() {
  return log4js.getLogger('puppeteer-record');
}
/**
 * Looks up the log4js logger for the 'puppeteer-har-record' category name.
 *
 * @returns {object} whatever log4js.getLogger('puppeteer-har-record') returns.
 */
function getPuppeteerHarRecordLogger() {
  return log4js.getLogger('puppeteer-har-record');
}
/**
 * Looks up the log4js logger for the 'puppeteer-harevent-record' category name.
 *
 * @returns {object} whatever log4js.getLogger('puppeteer-harevent-record') returns.
 */
function getPuppeteerHarEventRecordLogger() {
  return log4js.getLogger('puppeteer-harevent-record');
}
// Public API of the logging module: one accessor per logger name.
Object.assign(exports, {
  getConsoleLogger,
  getPuppeteerRecordLogger,
  getPuppeteerHarRecordLogger,
  getPuppeteerHarEventRecordLogger,
});
2,创建抓取的代码(ultra-harlog/module/puppeteerhar.js)
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
const path = require("path");
const logger=require("./log");
const grpcclient=require("./grpcclient");
const log = logger.getPuppeteerHarRecordLogger() ;
/**
 * Launches a browser instance through Puppeteer.
 *
 * @returns {Promise<object>} resolves with the value of puppeteer.launch().
 */
async function launchBrowser() {
  return puppeteer.launch({
    // For a manually downloaded Chromium, set `executablePath` to its binary;
    // otherwise Puppeteer uses the copy under node_modules/puppeteer/.local-chromium/.
    // Ignore HTTPS errors when visiting https pages.
    ignoreHTTPSErrors: true,
    // Headless mode: no visible browser window is opened.
    headless: true,
    // Browser start-up switches, see
    // https://peter.sh/experiments/chromium-command-line-switches/
    args: [
      '--disk-cache-size=0',
      '--disable-cache',
      '--disable-infobars',
      '--window-size=800,600',
      '--ignore-certificate-errors',
    ],
    // Do not auto-open the DevTools panel for each tab (enabling it would
    // force headless off).
    devtools: false,
    // Launch timeout in ms; defaults to 30000, 0 disables the timeout.
    timeout: 0,
    // For debugging, `slowMo: 250` slows down every Puppeteer action.
  });
}
/**
 * Loads `url` in a freshly launched browser and records the page's network
 * traffic as a HAR file at `path.join(dirPath, filename)`.
 *
 * Errors raised while recording or navigating are logged and swallowed.
 * Errors from launching the browser, obtaining a page, or stopping the HAR
 * recorder propagate to the caller. The browser is closed in every case.
 *
 * @param {string} url - Address to visit; "http://" is prepended when it does
 *   not already start with "http://" or "https://".
 * @param {string} dirPath - Directory the HAR file is written to.
 * @param {string} filename - Name of the HAR file inside `dirPath`.
 * @returns {Promise<void>}
 */
async function saveHarlog(url, dirPath, filename) {
  const harFilePath = path.join(dirPath, filename);
  const hasScheme = url.startsWith('http://') || url.startsWith('https://');
  const targetUrl = hasScheme ? url : "http://" + url;

  const browser = await launchBrowser();
  try {
    // Reuse the tab the browser starts with; fall back to a new tab if the
    // browser reports none.
    const openPages = await browser.pages();
    const page = openPages[0] || await browser.newPage();
    // NOTE(review): page.waitFor is not available in newer Puppeteer
    // releases — confirm against the pinned version.
    await page.waitFor(1000); // delay 1 s

    const har = new PuppeteerHar(page);
    try {
      await har.start({ path: harFilePath });
      await page.goto(targetUrl, {
        timeout: 0
      });
      const frame = page.mainFrame();
      // title() is asynchronous; without await a pending Promise is logged.
      log.info(await frame.title());
      log.info(frame.url());
      // A gRPC notification to the Java HAR parser (grpcclient.resovleHarLog)
      // used to be sent here; it is currently disabled.
    } catch (error) {
      log.error('resovle error :' + targetUrl + "; error message:" + error);
    } finally {
      await har.stop();
    }
  } finally {
    // Outer finally so the browser is closed even when har.stop() or the
    // page set-up above throws.
    await browser.close();
  }
}
// Public API of the capture module.
Object.assign(exports, {
  launchBrowser,
  saveHarlog,
});
3,创建启动文件(ultra-harlog/puppeteerhar-app.js)
const fs = require("fs");
const path = require("path");
const moment = require("moment");
const schedule = require('node-schedule');
const cvsresovler=require("./module/cvsresovle");
const mhar=require("./module/puppeteerhar");
/*
cnpm install --save moment
cnpm install --save csv
cnpm install --save node-schedule
cnpm install --save puppeteer
cnpm install --save puppeteer-har
cnpm install --save iconv-lite
cnpm install --save chrome-har
cnpm install --save grpc
*/
/**
 * Registers the scheduled crawl job.
 *
 * When the job fires it creates `<__dirname>/harlogs/<YYYYMMDDHHmm>`, reads
 * the URL list from `top300.csv` via cvsresovler.readUrlRecord, and captures
 * one HAR file per URL, strictly one after another. Failures are logged to
 * the console and never thrown out of the job.
 */
function init() {
  console.log('初始化调度器');
  // Six-field cron spec (second minute hour day month weekday):
  // fires once a day at 10:14:00.
  schedule.scheduleJob('0 14 10 * * *', async () => {
    try {
      const ftime = moment().format('YYYYMMDDHHmm');
      console.log('当前调度时间为:' + ftime);
      const dirPath = path.join(__dirname, 'harlogs', ftime);
      console.log("创建目录:" + dirPath);
      const alreadyDir = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory();
      if (!alreadyDir) {
        console.log("创建文件夹" + ftime);
        // recursive: the parent "harlogs" directory may not exist yet.
        fs.mkdirSync(dirPath, { recursive: true });
      }
      // Presumably an array of row objects keyed by CSV column name —
      // verify against cvsresovle.readUrlRecord.
      const dataArr = cvsresovler.readUrlRecord(path.join(__dirname, 'top300.csv'));
      console.log("解析出URL共计" + dataArr.length + "条");
      // Sequential on purpose: each capture is awaited before the next one
      // starts, so only one browser runs at a time.
      for (let i = 0; i < dataArr.length; i += 1) {
        const row = dataArr[i];
        const rawLink = row ? row['SITE_LINK'] : undefined;
        const url = typeof rawLink === 'string' ? rawLink.trim() : '';
        if (!url) {
          // Skip rows without a usable link instead of aborting the run.
          continue;
        }
        // Replace characters that are unsafe in file names (path separators,
        // ':', '?', ...) so the HAR file lands directly inside dirPath.
        const filename = url.replace(/[\\\/:*?"<>|]+/g, '-') + '.har';
        console.log((i + 1) + "-starting to resovle url :" + url);
        try {
          await mhar.saveHarlog(url, dirPath, "N" + "-" + filename);
        } catch (error) {
          console.log(error);
        }
      }
    } catch (error) {
      // Directory or CSV failures: report them rather than leaving an
      // unhandled rejection inside the scheduler.
      console.log(error);
    }
  });
  console.log('应用程序启动完成');
}
//执行
//init();
/**
 * Manual smoke test: captures a HAR file for a single URL.
 *
 * The URL is the first command-line argument, or "www.baidu.com" when none
 * is given. Output goes to `<__dirname>/harlogs/<YYYYMMDDHHmm>/NT-<name>.har`.
 * Errors from the capture itself are logged to the console.
 *
 * @returns {Promise<void>}
 */
async function test() {
  const ftime = moment().format('YYYYMMDDHHmm');
  console.log('当前执行时间为:' + ftime);
  const dirPath = path.join(__dirname, 'harlogs', ftime);
  console.log("创建目录:" + dirPath);
  const alreadyDir = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory();
  if (!alreadyDir) {
    console.log("创建文件夹" + ftime);
    // recursive: the parent "harlogs" directory may not exist yet.
    fs.mkdirSync(dirPath, { recursive: true });
  }
  // slice() leaves process.argv untouched; the local is not called
  // `arguments` because that name shadows the function's own arguments
  // object and is rejected in strict mode.
  const cliArgs = process.argv.slice(2);
  const url = (cliArgs.length > 0 ? cliArgs[0] : "www.baidu.com").trim();
  if (!url) {
    return;
  }
  // Replace characters that are unsafe in file names (path separators,
  // ':', '?', ...) so the HAR file lands directly inside dirPath.
  const filename = url.replace(/[\\\/:*?"<>|]+/g, '-') + '.har';
  console.log("starting to resovle test url :" + url);
  try {
    await mhar.saveHarlog(url, dirPath, "NT" + "-" + filename);
  } catch (error) {
    console.log(error);
  }
}
// Run the single-URL smoke test; report a failure and exit non-zero instead
// of leaving the returned promise unhandled.
test().catch((error) => {
  console.log(error);
  process.exitCode = 1;
});
关于GRPC部分的代码,请参考我另外一篇博文
参考地址:https://michaljanaszek.com/blog/generate-har-with-puppeteer
原创文章,作者:carmelaweatherly,如若转载,请注明出处:https://blog.ytso.com/196817.html