ThinkPHP类库-Org类库-CollectPage(单页面采集)
2026-04-26 21:47:16
213
分类:php
用于单个页面的所有资源采集,包括css,img,js的下载并还原路径,能抓取大部分的网站,是一个做仿站的快捷方法。
由于html代码中的引用路径有绝对路径、相对路径等多种形式,所以程序只能适用于部分页面的采集。css中引用的图片也未采集。
<?php
// +----------------------------------------------------------------------
// | Author: 栉风沐雨 <188150920@qq.com>
// +----------------------------------------------------------------------
// | CollectPage.class.php 创建时间:2018/05/01
// | 最后修改时间:2018-07-20
// +----------------------------------------------------------------------
/**
 * Single-page scraper: fetches one HTML page, downloads the css/js/img
 * resources it references and rewrites those references to local paths.
 *
 * Supported reference forms:
 *  - absolute URLs with a host ("http://host/x", "//host/x")
 *  - root-relative paths ("/x")
 *  - document-relative paths ("x", "./x")
 *
 * Parent-relative paths ("../x") are not supported and are skipped, as are
 * references that cannot be saved as a file (e.g. "data:" URIs, URLs that
 * end in "/"). Images referenced from inside css files are not collected.
 */
class CollectPage
{
    private $debug;     // when true, progress is echoed while scraping
    private $file;      // \Think\File helper used for all file operations
    private $content;   // page HTML; resource references are rewritten in place
    private $parse_url; // parse_url() result for the page URL
    private $pathinfo;  // pathinfo() result for the page URL (kept for compatibility)
    private $rootPath;  // root directory the page and its resources are saved under

    /**
     * Scrapes $url immediately and stores the result under $rootPath.
     *
     * @param string $url      Page to scrape; an empty value makes the constructor a no-op.
     * @param string $rootPath Directory that receives index.html and the resources.
     * @param bool   $debug    Echo progress messages while working.
     */
    public function __construct($url, $rootPath = './Resources/station', $debug = true)
    {
        if ($url == '') {
            // A constructor cannot return a value; just leave the object idle.
            return;
        }
        $this->debug = $debug;
        // ThinkPHP loader call; presumably makes \QueryList available.
        import("ORG.QueryList");
        $this->file = new \Think\File();
        $this->rootPath = $rootPath;
        $this->init($url);
    }

    /**
     * Fetches the page, collects css/js/img resources and writes index.html.
     *
     * @param string $url Page URL.
     */
    private function init($url)
    {
        $this->parse_url = parse_url($url);
        $this->pathinfo = pathinfo($url);
        $ql = new \QueryList($url);
        $this->content = $ql->getHtml();

        $this->collect($ql, 'css', 'link', 'href');
        $this->collect($ql, 'js', 'script', 'src');
        $this->collect($ql, 'img', 'img', 'src');

        $this->file->createFolder($this->rootPath);
        $this->file->writeFile($this->rootPath . '/index.html', $this->content);
        $this->debugOut('<p style="color:green;">抓取完毕</p>');
    }

    /**
     * Runs one QueryList selection and hands every non-empty match to parseUrl().
     *
     * @param object $ql   QueryList instance for the page.
     * @param string $key  Result key, also used in the progress message (css/js/img).
     * @param string $tag  Element selector.
     * @param string $attr Attribute holding the resource URL.
     */
    private function collect($ql, $key, $tag, $attr)
    {
        $this->debugOut('<p style="color:green">采集' . $key . '</p>');
        $ql->setQuery(array(
            $key => array($tag, $attr),
        ));
        $list = $ql->jsonArr;
        if (!is_array($list)) {
            return;
        }
        foreach ($list as $value) {
            if (empty($value[$key])) {
                continue;
            }
            $this->parseUrl($value[$key]);
        }
    }

    /**
     * Echoes a progress fragment and flushes it to the client when debugging.
     *
     * @param string $html Markup to print.
     */
    private function debugOut($html)
    {
        if (!$this->debug) {
            return;
        }
        echo $html;
        // ob_flush() raises a notice when no output buffer is active.
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }

    /**
     * Scheme, host and optional port of the scraped page, e.g. "http://host:8080".
     *
     * @return string
     */
    private function pageOrigin()
    {
        $scheme = isset($this->parse_url['scheme']) ? $this->parse_url['scheme'] : 'http';
        $host = isset($this->parse_url['host']) ? $this->parse_url['host'] : '';
        $port = isset($this->parse_url['port']) ? ':' . $this->parse_url['port'] : '';
        return $scheme . '://' . $host . $port;
    }

    /**
     * Base URL (no trailing slash) that document-relative references resolve against.
     *
     * Everything after the last "/" of the page path is dropped, so both
     * "http://host" and "http://host/a/" yield a usable base.
     *
     * @return string
     */
    private function pageDirectory()
    {
        $path = isset($this->parse_url['path']) ? $this->parse_url['path'] : '/';
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '' : substr($path, 0, $slash);
        return $this->pageOrigin() . $dir;
    }

    /**
     * Downloads one referenced resource and rewrites its reference in the page HTML.
     *
     * @param string $url Reference exactly as it appears in the HTML.
     * @param mixed  $id  Unused; optional so the one-argument calls in this class work.
     */
    private function parseUrl($url, $id = null)
    {
        $parts = parse_url($url);
        if ($parts === false) {
            return;
        }
        $scheme = isset($parts['scheme']) ? $parts['scheme'] : ''; // protocol
        $host = isset($parts['host']) ? $parts['host'] : '';       // domain
        $path = isset($parts['path']) ? $parts['path'] : '';       // path

        // Nothing that could be saved as a file (e.g. "http://host", "?x=1", "dir/").
        if ($path === '' || substr($path, -1) === '/') {
            return;
        }

        if ($host) {
            // Absolute URL; protocol-relative ("//host/x") borrows the page scheme.
            if ($scheme) {
                $resPath = $url;
            } else {
                $pageScheme = isset($this->parse_url['scheme']) ? $this->parse_url['scheme'] : 'http';
                $resPath = $pageScheme . ':' . $url;
            }
            $savePath = $path;
        } else if ($scheme) {
            // Scheme without host: data:, javascript:, mailto: ... not downloadable.
            return;
        } else if (substr($path, 0, 1) == '/') {
            // Root-relative ("/path").
            $resPath = $this->pageOrigin() . $url;
            $savePath = $url;
        } else if (substr($path, 0, 2) == '..') {
            // Parent-relative ("../") is not supported; skip instead of
            // continuing with undefined download/save paths.
            echo('<p style="color:red">这种情况还没做呢</p>');
            return;
        } else if (substr($path, 0, 2) == './') {
            // Explicit current directory ("./path").
            $trimmed = substr($url, 1);
            $resPath = $this->pageDirectory() . $trimmed;
            $savePath = $trimmed;
        } else {
            // Plain document-relative ("path/file").
            $resPath = $this->pageDirectory() . '/' . $url;
            $savePath = '/' . $url;
        }

        // Strip the query string from the local file name.
        if (!empty($parts['query'])) {
            $savePath = $this->file->clearUrlQuery($savePath);
        }

        $localRef = substr($savePath, 1); // reference written into the HTML (no leading "/")
        $target = $this->rootPath . $savePath;

        // Download only once, but always rewrite the reference below so that
        // re-runs and duplicate targets still end up pointing at the local copy.
        if (!is_file($target)) {
            $dir = $this->file->getFilePath($target);
            if (!is_dir($dir)) {
                $this->file->createFolder($dir);
            }
            $suf = $this->file->getFileSuffix($resPath);
            if ($suf == 'css' || $suf == 'js') {
                $this->file->downloadFile($resPath, $target);
            } else {
                $this->file->curlDownload($resPath, $target);
            }
            $this->debugOut('下载路径:' . $resPath . '<br>' . '保存路径:' . $target . '<br>');
        }

        $this->content = str_replace($url, $localRef, $this->content);
    }
}