使用phpquery采集小说

xiaoxiao2021-02-28 146

使用phpquery采集小说

说明

phpQuery是基于php5新添加的DOMDocument实现的，而DOMDocument则是专门用来处理html/xml的。它提供了强大的xpath选择器以及其他很多html/xml操作函数，使得处理html/xml非常方便。

步骤

先获取小说列表页中a标签的链接，再进入详情页获取标题和内容，最后把结果作为txt输出到浏览器。

代码地址

代码

<?php
/**
 * Novel collector built on phpQuery.
 *
 * Created by PhpStorm.
 * User: rex
 * Date: 2017/7/6 0006
 * Time: 11:32 AM
 */
include 'phpquery/phpQuery/phpQuery.php';

/**
 * Reads a chapter list page, follows every chapter link found on it,
 * extracts the title and body of each chapter page and finally sends the
 * concatenated text to the browser as a single .txt download.
 */
class Collection
{
    private $url;      // URL of the list page
    private $sonUrl;   // prefix prepended to every chapter href
    private $list;     // selector locating the chapter <a> elements on the list page
    private $titleTag; // selector locating the title on a chapter page
    private $conTag;   // selector locating the body on a chapter page
    private $br;       // HTML line-break marker that gets replaced by CRLF
    private $delete;   // not read or written anywhere in this class

    /**
     * Entry point: fills in the scraping parameters, prepares the runtime
     * and starts the collection.
     */
    public function init()
    {
        // The parameters are hard-coded here and handed over through $_GET,
        // which is where setVar() reads them from.
        $_GET['url'] = "http://www.bixia.org/59_59857/";
        $_GET['sonUrl'] = "http://www.bixia.org/59_59857";
        $_GET['list'] = ".box_con dl dd a";
        $_GET['titleTag'] = "h1";
        $_GET['conTag'] = "#content";
        $_GET['br'] = "<br>";

        // Send the header before setVar(), because setVar() may produce output.
        header("Content-type: text/html; charset=utf-8");
        ini_set('date.timezone', 'Asia/Shanghai');
        set_time_limit(0);

        // Stop here on an invalid list URL instead of scraping an empty URL.
        if (!$this->setVar()) {
            return;
        }

        $this->coll();
    }

    /**
     * Collects every chapter and streams the result as a text download.
     * Ends the script via exit() once the download has been written.
     */
    private function coll()
    {
        $mxUrl = $this->sonUrl;

        // Load the list page; pq() then queries this document.
        phpQuery::newDocumentFile($this->url);

        // Gather the href of every chapter link, in document order.
        $list = array();
        foreach (pq($this->list) as $li) {
            $href = pq($li)->attr('href');
            if ($href === null || $href === '') {
                // An anchor without a usable href cannot point to a chapter.
                continue;
            }
            $list[] = $href;
        }
        // sort($list); // optional: sort the links

        $result = '';
        foreach ($list as $href) {
            $html = file_get_contents($mxUrl . $href);
            if ($html === false) {
                // Skip chapters that could not be fetched rather than feeding
                // false into the parser.
                error_log('Collection: failed to fetch ' . $mxUrl . $href);
                continue;
            }

            // Pages carrying a charset="..." declaration are converted from
            // gb2312 to UTF-8 (kept from the original logic).
            if (strpos($html, 'charset="') !== false) {
                $converted = iconv("gb2312", "utf-8//IGNORE", $html);
                if ($converted !== false) {
                    $html = $converted;
                }
            }

            // Parse the chapter page.
            phpQuery::newDocument($html);

            $title = pq($this->titleTag)->text();
            $con = pq($this->conTag)->html();

            // Turn HTML line breaks into plain-text line breaks.
            $con = str_replace($this->br, "\r\n", $con);

            $result .= $title . "\r\n";
            $result .= $con . "\r\n";
            // No progress output here: anything echoed before the headers
            // below would either block them or end up inside the download.
        }

        // Send the collected text as a .txt attachment; the file name needs
        // its extension.
        $filename = time() . '.txt';
        header('Content-Type: text/plain; charset=utf-8');
        header('Expires: ' . gmdate('D, d M Y H:i:s') . ' GMT');
        header('Content-Disposition: attachment; filename="' . $filename . '"');
        header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
        header('Pragma: public');

        echo $result;
        exit();
    }

    /**
     * Copies the scraping parameters from $_GET into the object.
     *
     * @return bool true when the list URL passed validation, false otherwise
     *              (in which case an alert script has been echoed).
     */
    private function setVar()
    {
        $this->url = $this->isValidUrl($_GET['url']) ? $_GET['url'] : '';
        $this->sonUrl = $_GET['sonUrl'];
        $this->list = $_GET['list'];
        $this->titleTag = $_GET['titleTag'];
        $this->conTag = $_GET['conTag'];
        $this->br = $_GET['br'];

        if ($this->url === '') {
            echo "<script> alert('url错误') </script>";
            return false;
        }

        return true;
    }

    /**
     * Checks whether a string looks like an http(s) URL.
     *
     * @param string $url candidate URL
     * @return bool true when the URL matches the pattern below
     */
    public function isValidUrl($url)
    {
        $patern = '/^http[s]?:\/\/' .
            '(([0-9]{1,3}\.){3}[0-9]{1,3}' .           // IP-style host, e.g. 199.194.52.184
            '|' .                                      // either an IP or a domain name
            '([0-9a-z_!~*\'()-]+\.)*' .                // sub-domain labels, e.g. www.
            '([0-9a-z][0-9a-z-]{0,61})?[0-9a-z]\.' .   // second-level domain
            '[a-z]{2,6})' .                            // top-level domain, e.g. .com or .museum
            '(:[0-9]{1,5})?' .                         // optional port, up to five digits (65535)
            '((\/\?)|' .                               // optional path / query part
            '(\/[0-9a-zA-Z_!~\*\'\.;\?:@&=\+\$,%#-\/]*)?)$/';

        // preg_match yields 1 on a match, 0 on no match and false on error.
        return preg_match($patern, $url) === 1;
    }
}

$coll = new Collection();
$coll->init();

转载请注明原文地址: https://www.6miu.com/read-23389.html

技术

最新回复(0)