php文本采集类

/*
使用示例
$C = new Collection();
$C->url = 'http://www.douban.com/subject_search?cat=1001&search_text=%E5%B9%B4%E8%BD%BB%E4%BA%BA';
$C->startFlag = '<p class="ul"></p>';
$C->endFlag        = '<div class="paginator">';
$C->init();
$C->regExp = "|<div class=\"m\"><a href=\"(.*)\">(.*)</a></div><p class=\"pl\">(.*)</p>|Uis";
$C->parse();
print_r($C->result);
*/
         ##################################################
         ##       模块名:php文本采集类                                   ##
         ##       功能描述:                                       ##
         ##       作者:Jufei jufeis@gmail.com                    ##
         ##                         walkpast.com                               ##
         ##       创建日期:2007-3-20                             ##
         ##       修改日期:2006-3-28                             ##
         ##                                                   ##
         ##################################################

/**
 * Simple text scraper: fetches a document, optionally converts its encoding,
 * narrows it to the block between two marker strings and extracts items from
 * that block with a regular expression.
 *
 * Typical use: set $url, $startFlag, $endFlag, call init(), set $regExp,
 * call parse(), then read $result.
 */
class Collection{
    // ---- Inputs (set by the caller) ----
    var $url;       // URL (or path) of the document to analyse
    var $content;   // Text that was read; may also be pre-set by the caller instead of $url
    var $regExp;    // Regular expression describing the items to extract
    var $codeFrom;  // Source encoding; detected automatically when left empty
    var $codeTo;    // Target encoding; no conversion is done when left empty
    var $timeout;   // Seconds to wait while fetching (applied to HTTP(S) streams)

    var $startFlag; // Marker where the searched block starts; empty = start of content
    var $endFlag;   // Marker where the searched block ends; empty = end of content
    var $block;     // The text between $startFlag and $endFlag (begins with $startFlag itself)

    // ---- Output ----
    var $result;    // Matches produced by parse(), in PREG_SET_ORDER layout

    /**
     * Prepares the collector: loads $content from $url (when a URL is given)
     * and converts its encoding.
     *
     * When no URL is set, content pre-assigned by the caller is used as-is.
     * Encoding conversion is skipped if nothing could be loaded.
     */
    function init(){
        if (!empty($this->url)) {
            if ($this->getFile() === false) {
                // getFile() already reported the failure; nothing to convert.
                return;
            }
        } elseif (!isset($this->content) || $this->content === '') {
            $this->error("can't read the url:".$this->url);
            return;
        }
        $this->convertEncoding();
    }

    /**
     * Extracts the wanted items from the block between the two flags.
     *
     * Fills $result with the matches of $regExp (an empty array when nothing
     * matched or the pattern is missing/invalid).
     *
     * @return string the block that was searched
     */
    function parse(){
        $this->getBlock();
        $this->result = array();

        if (empty($this->regExp)) {
            $this->error("regExp is not set");
            return $this->block;
        }

        $matches = array();
        $found = preg_match_all($this->regExp, $this->block, $matches, PREG_SET_ORDER);
        if ($found === false) {
            $this->error("can't apply regExp:".$this->regExp);
        } else {
            $this->result = $matches;
        }
        return $this->block;
    }

    /**
     * Error handler: prints the message.
     *
     * @param string $msg message to report
     */
    function error($msg){
        echo $msg;
    }

    /**
     * Reads the document at $url into $content, normalising all line endings
     * to "\n".
     *
     * @return string|false the content on success, false on failure
     */
    function getFile(){
        // An empty path makes the file functions throw on PHP 8; report it instead.
        if (empty($this->url)) {
            $this->error("can't read the url:".$this->url);
            return false;
        }

        // Warnings are suppressed because failures are reported through error().
        if (!empty($this->timeout)) {
            $context = stream_context_create(array(
                'http' => array('timeout' => (float) $this->timeout),
            ));
            $raw = @file_get_contents($this->url, false, $context);
        } else {
            $raw = @file_get_contents($this->url);
        }

        // An empty document is treated as a failed read, as before.
        if ($raw === false || $raw === '') {
            $this->error("can't read the url:".$this->url);
            return false;
        }

        $this->content = str_replace(array("\r\n", "\r"), "\n", $raw);
        return $this->content;
    }

    /**
     * Computes $block: the part of $content from $startFlag (inclusive) up to
     * $endFlag (exclusive).
     *
     * An empty or missing flag, or a flag that does not occur in the text,
     * leaves that side of the block open (start / end of the content).
     */
    function getBlock(){
        $block = isset($this->content) ? (string) $this->content : '';

        if (!empty($this->startFlag)) {
            $start = strpos($block, $this->startFlag);
            if ($start !== false) {
                $block = substr($block, $start);
            }
        }

        if (!empty($this->endFlag)) {
            // Searched in the already trimmed block so an end marker that
            // appears before the start marker is ignored.
            $end = strpos($block, $this->endFlag);
            if ($end !== false) {
                $block = substr($block, 0, $end);
            }
        }

        $this->block = $block;
    }

    /**
     * Converts $content from $codeFrom to $codeTo.
     *
     * Runs only when $codeTo is given. $codeFrom is auto-detected only when
     * the caller did not supply it. $content is left untouched on failure.
     */
    function convertEncoding(){
        if (empty($this->codeTo) || !isset($this->content) || $this->content === '') {
            return;
        }

        if (empty($this->codeFrom)) {
            $detected = mb_detect_encoding($this->content);
            if ($detected === false) {
                $this->error("can't convert Encoding");
                return;
            }
            $this->codeFrom = $detected;
        }

        $converted = mb_convert_encoding($this->content, $this->codeTo, $this->codeFrom);
        if ($converted === false) {
            $this->error("can't convert Encoding");
            return;
        }
        $this->content = $converted;
    }
}//end of class

Popularity: -2%

No Responses to “php文本采集类”

Leave a Reply:

Name (required):
Mail (will not be published) (required):
Website:
Comment (required):
XHTML: You can use these tags: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>