php采集远程文章简单类
时间:2015-05-08 00:28:53 浏览:4113
<?php
/**
 * Simple scraper ("gather") class: downloads a remote page, cuts a region out
 * of it between two marker strings, and extracts the links in that region.
 *
 * @author Milkcy QQ:9877633
 * @copyright (C) 2012-2015 TCCMS.COM
 * @lastmodify 2012-07-10 14:00
 */
class gather {
    /** @var string Not read or written by any method of this class. */
    public $pagestring = '';

    /** @var mixed Copy of the global $db taken in the constructor; unused by the methods below. */
    private $db;

    /**
     * Stores whatever the global $db currently holds (null when it is not defined).
     */
    public function __construct() {
        global $db;
        $this->db = $db;
    }

    /**
     * Downloads the body of a URL.
     *
     * Uses cURL (following redirects) when the extension is loaded and falls
     * back to file_get_contents() otherwise.
     *
     * @param string $url Address to fetch; surrounding whitespace is ignored.
     * @return string Trimmed response body, or '' for an empty URL or a failed request.
     */
    public function geturlfile($url) {
        $url = trim((string) $url);
        if ($url === '') {
            // Nothing to fetch; avoids handing an empty path to the transport.
            return '';
        }

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both transports signal failure with false; report that as an empty body
        // instead of passing a boolean to trim().
        if ($content === false) {
            return '';
        }

        return trim($content);
    }

    /**
     * Extracts every <a href=...>text</a> link from an HTML fragment.
     *
     * The href value may be double-quoted, single-quoted or unquoted. Only
     * anchors whose text contains no '>' character are matched, so anchors
     * wrapping other tags (e.g. images) are skipped.
     *
     * @param string $code HTML fragment to scan.
     * @return array array('name' => list of link texts, 'url' => list of href values),
     *               both lists index-aligned; both empty when nothing matches.
     */
    public function get_all_url($code) {
        // <a, at least one whitespace, any other attributes of the same tag,
        // href=, optional quote, the URL, optional quote, rest of the tag, text, </a>.
        $pattern = '/<a\s[^>]*?href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/is';

        $found = preg_match_all($pattern, (string) $code, $arr);
        if ($found === false || $found === 0) {
            return array('name' => array(), 'url' => array());
        }

        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Returns the text that follows the first occurrence of $start, cut off at
     * the next occurrence of either $start or $end, whichever comes first.
     *
     * @param string $str   Text to search.
     * @param string $start Opening marker (trimmed before use).
     * @param string $end   Closing marker (trimmed before use).
     * @return string $str unchanged when either marker is empty; '' when $start
     *                does not occur; otherwise the extracted segment (which
     *                runs to the end of that segment when $end is absent).
     */
    public function get_sub_content($str, $start, $end) {
        $start = trim((string) $start);
        $end = trim((string) $end);
        if ($start === '' || $end === '') {
            return $str;
        }

        // Limit 3: only the piece between the first and second $start is needed.
        $pieces = explode($start, (string) $str, 3);
        if (count($pieces) < 2) {
            // Start marker not present: there is no region to return.
            return '';
        }

        $segment = explode($end, $pieces[1], 2);
        return $segment[0];
    }

    /**
     * Prints a var_dump() of a value wrapped in a styled <div><pre> box.
     *
     * NOTE(review): the dump is written as-is, without HTML escaping, so markup
     * inside $var is interpreted by the browser. Do not use on untrusted data
     * in a page served to other users.
     *
     * @param mixed $var Value to dump.
     * @return void
     */
    public function vd($var) {
        echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\n";
        echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\n";
        var_dump($var);
        echo "\n</pre>\n";
        echo "</div>";
    }
}
?>
<?php
// Demo: scrape the first article linked from a NetEase news list page and dump its body.

// Directory of this script, with backslashes normalised to forward slashes.
define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
// The class is mandatory for everything below, so fail immediately if it is missing.
require_once ROOT_PATH . '/gather.class.php';

// No execution time limit: remote fetches can be slow.
set_time_limit(0);
header('Content-type: text/html; charset=gb2312');

// Target list page.
$url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';

// Create the scraper.
$gather = new gather();

// Fetch the HTML of the list page.
$html = $gather->geturlfile($url);

// Markers delimiting the article list inside the list page.
$start = '<div class="bd clearfix">';
$end = '<div class="pages-1 mt25">';

// Collect the article URLs and titles found between the markers.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);

// Uncomment to inspect the collected links:
//$gather->vd($newsAry);

// Stop early when the list page could not be fetched or its layout changed,
// instead of reading a missing array offset and fetching an empty URL.
if (empty($newsAry['url'][0])) {
    exit('No article links found on the list page.');
}
$tarGetUrl = $newsAry['url'][0];

// Fetch the HTML of the first article.
$html = $gather->geturlfile($tarGetUrl);

// Markers delimiting the article body inside the article page.
$start = '<div id="endText">';
$end = '<span class="cDGray right" style="white-space:nowrap;">';

// Cut out the article body.
$code = $gather->get_sub_content($html, $start, $end);

// Embedded ad iframe and site icon link that should not appear in the result.
$killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
$killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
$code = str_replace($killHtml, '', $code);
$code = str_replace($killHtml2, '', $code);

// Print the cleaned article body.
$gather->vd($code);
?>
文件gather.class.php
1
应用文件:
1
上一篇:UTF-8编码ASP页面 输出 GB2312 (GBK)编码的变量 超级简单
下一篇:php单引号和双引号导致Mysql操作失败的问题
- Linux文章
- PHP文章
- 随机文章
- Linux中的find(-atime...
- mysql的expire_logs_...
- PHP 扩展 libsodium s...
- Linux下利用find和cp实现筛...
- 使用mysqldump命令导出备份m...
- Linux系统如何设置开机自动运行脚...
- Linux上实现秒级执行的定时任务
- shell echo -e 颜色输出
- Linux下通过grep查找指定的进...
- 解决执行脚本报syntax erro...
发表评论
昵称: 验证码: