已知某个网址http://www.***.com,扒出所有的新闻

xiaoxiao2021-02-28  19

app.js文件:

const fs = require('fs');
const request = require('superagent');
const cheerio = require('cheerio');
const mapLimit = require('async/mapLimit');

const url = 'http://www.hiynn.com/hy-zh';
let counter = 0;

/**
 * Fetch the news index page and collect the article links.
 * @returns {Promise<string[]>} relative hrefs of every news article
 */
function getLinks() {
  return new Promise((resolve, reject) => {
    request.get(`${url}/news.html`).end((err, data) => {
      // Propagate failures through the Promise instead of throwing
      // inside the callback (which would be an unhandled exception).
      if (err) return reject(err);
      const $ = cheerio.load(data.text);
      const links = [];
      $('h3 > a').each((index, item) => {
        const href = $(item).attr('href');
        // Check for a missing href FIRST, then skip "#" anchors.
        // (The original called .match() before the null check and
        // crashed on <a> tags without an href attribute.)
        if (!href || href.match('#')) return;
        links.push(href);
      });
      resolve(links);
    });
  });
}

/**
 * Derive a "20YY-MM-DD" creation date from a link such as
 * "news160812/xxx.html" (YYMMDD is embedded at chars 5..10).
 * @param {string} link - relative article link
 * @returns {string} e.g. "2016-08-12"
 */
function parseCreateTime(link) {
  const digits = link.slice(5, 11); // YYMMDD
  return `20${digits.slice(0, 2)}-${digits.slice(2, 4)}-${digits.slice(4, 6)}`;
}

/**
 * Extract every <img> src attribute from an HTML fragment.
 * @param {string} html - article body HTML
 * @returns {string[]} all src values found (empty array if none)
 */
function extractImgSrcs(html) {
  const imgReg = /<img.*?(?:>|\/>)/gi;      // every <img ...> tag
  const srcReg = /src=['"]?([^'"]*)['"]?/i; // src attribute inside one tag
  const tags = html.match(imgReg) || [];
  const srcs = [];
  for (const tag of tags) {
    const m = tag.match(srcReg);
    // Collect EVERY src. The original overwrote its result variable on
    // each loop iteration, so only the last image's src survived.
    if (m) srcs.push(m[1]);
  }
  return srcs;
}

/**
 * Crawl the whole news section: list the links, fetch each article
 * (max 10 concurrent, throttled to ~1 req/sec), and write the
 * aggregated records to ./links.js as JSON.
 * @returns {Promise<Object[]>} the collected news records
 */
function getNews() {
  return getLinks().then(
    (links) =>
      new Promise((resolve, reject) => {
        /**
         * mapLimit(coll, limit, iteratee, callback):
         *   - coll: items to iterate
         *   - limit: max concurrent requests (10)
         *   - iteratee(item, cb): per-item worker; cb MUST be called
         *   - callback(err, results): fires after all items finish
         */
        mapLimit(
          links,
          10,
          (link, cb) => {
            request.get(`${url}/${link}`).end((err, data) => {
              // Report the error through the async callback rather
              // than throwing, so mapLimit can surface it.
              if (err) return cb(err);
              const $ = cheerio.load(data.text, {
                xmlMode: true,
                decodeEntities: false,
                normalizeWhitespace: true,
                withDomLvl1: false,
              });
              const contentHtml = $('#tab1').html();
              // Guard against pages with no <h3> title; the original
              // dereferenced match(...)[1] and crashed on null.
              const titleMatch = contentHtml.match(/<h3>(.+)<\/h3>/);
              const srcs = extractImgSrcs(contentHtml);
              const typeArr = [1, 2];
              const news = [
                {
                  counter,
                  author: '小明',
                  content: '',
                  creat_time: parseCreateTime(link),
                  link,
                  content_html: contentHtml,
                  deleted_flag: 0,
                  important: 2,
                  title: titleMatch ? titleMatch[1].trim() : '',
                  // Last src found, or a placeholder when the page has
                  // no images (replaces the original's hard-coded
                  // two-element fallback array indexed at [1]).
                  img: srcs.length
                    ? srcs[srcs.length - 1]
                    : './images/news161222/3.png',
                  type: Math.ceil(Math.random() * typeArr.length),
                },
              ];
              counter++;
              console.log(`${counter}/${links.length}`);
              // Throttle to roughly one request per second before
              // releasing this concurrency slot.
              setTimeout(() => cb(null, news), 1000);
            });
          },
          (err, coll) => {
            if (err) return reject(err);
            // Flatten the per-article one-element arrays into one list.
            const news = [].concat(...coll);
            fs.writeFile('./links.js', JSON.stringify(news), 'utf8', (writeErr) => {
              if (writeErr) return reject(writeErr);
              console.log('写入成功');
              resolve(news);
            });
          }
        );
      })
  );
}

getNews().catch((err) => console.error(err));

package.json

{ "name": "Crawler", "version": "1.0.0", "description": "", "main": "index.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1", "start": "node app.js" }, "keywords": [], "author": "", "license": "ISC", "dependencies": { "async": "^2.6.0", "cheerio": "^1.0.0-rc.2", "superagent": "^3.8.1", "whatwg-fetch": "^2.0.3" } }

最后生成一个links.js文件,文件内容是一个数组,包括所有新闻。

转载请注明原文地址: https://www.6miu.com/read-1600186.html

最新回复(0)