php网页抓取+分析



/**
 * Resolve a link found in a page to an absolute URL.
 *
 * - A link that already starts with http:// or https:// (any letter case)
 *   is returned unchanged.
 * - A link that starts with a slash is resolved against $host only; $prefix
 *   is ignored.
 * - Any other link is resolved against $host followed by $prefix.
 *
 * Empty and "." segments are dropped. A ".." segment removes the previous
 * segment (including segments contributed by $prefix) and never climbs above
 * the host root. A trailing slash on the link is preserved.
 *
 * @param string $path   Link to resolve; backslashes are treated as slashes.
 * @param string $host   Scheme and authority, e.g. "http://example.com".
 * @param string $prefix Base directory the link is relative to, e.g. "/docs".
 * @return string The absolute URL.
 */
function getAbsolutePath($path,$host,$prefix='') {
        $path = (string)$path;
        if (preg_match('#^https?://#i', $path)) return $path;

        // URLs always use forward slashes, whatever the host OS separator is.
        $path = str_replace('\\', '/', $path);

        // A root-relative link ("/x/y") must not inherit the base directory.
        $rooted = ($path !== '' && $path[0] === '/');
        $base = $rooted ? '' : str_replace('\\', '/', (string)$prefix);

        // Resolve dot segments over base + link so ".." can leave the base directory.
        $absolutes = array();
        foreach (explode('/', $base.'/'.$path) as $part) {
            if ($part === '' || $part === '.') continue;
            if ($part === '..') {
                array_pop($absolutes);
            } else {
                $absolutes[] = $part;
            }
        }
        $resolved = implode('/', $absolutes);

        // Keep a trailing slash, but never emit "//" when nothing was resolved.
        $end = '';
        if ($resolved !== '' && ($path === '' || substr($path, -1) === '/')) $end = '/';

        return $host.'/'.$resolved.$end;
}
/**
 * Collect the absolute URLs referenced by every element named $type in the
 * tree rooted at $node.
 *
 * The traversal is delegated to _callBackGetNode(), which accumulates its
 * results in $GLOBALS['node_tmp']; that slot is reset before the walk and
 * removed afterwards, so any value previously stored there is lost.
 *
 * $type used to declare a default of 'a', but because the required $uri
 * follows it the default could never be used, and PHP 8 reports such a
 * signature as deprecated. All three arguments were always required in
 * practice, so existing calls are unaffected.
 *
 * @param object $node Root node to walk (must expose name, attribute, child and hasChildren()).
 * @param string $type Element name to match.
 * @param array  $uri  URL components of the page, as passed on to _callBackGetNode().
 * @return array Map whose keys are the collected URLs (each value is 1).
 */
function getNodes($node,$type,$uri)
{
        $GLOBALS['node_tmp']=array();
        _callBackGetNode($node,$type,$uri);
        $collected=$GLOBALS['node_tmp'];
        unset($GLOBALS['node_tmp']);
        return $collected;
}
/**
 * Recursive worker for getNodes(): records the URL of $node if it is a
 * $type element, then descends into its children.
 *
 * For $type 'a' the href attribute is read; for any other type the src
 * attribute is read. Elements that lack the attribute are skipped. Each URL
 * is made absolute with getAbsolutePath() and stored as a key of
 * $GLOBALS['node_tmp'].
 *
 * @param mixed  $node Node to inspect; non-objects are ignored.
 * @param string $type Element name to match.
 * @param array  $uri  URL components of the page; 'scheme' and 'host' are
 *                     required, 'path' is optional.
 * @return void
 */
function _callBackGetNode($node,$type,$uri) {
        // Guard first: a non-object has neither a name nor children.
        if (!is_object($node)) return;

        if ($node->name == $type) {
                $attr = ($type == 'a') ? 'href' : 'src';
                if (isset($node->attribute[$attr])) {
                        // Relative links resolve against the directory of the page,
                        // not the page file itself: "/dir/page.html" -> "/dir".
                        $pagePath = isset($uri['path']) ? $uri['path'] : '';
                        $cut = strrpos($pagePath, '/');
                        $baseDir = ($cut === false) ? '' : substr($pagePath, 0, $cut);

                        // NOTE(review): the link is lowercased as in the original code,
                        // presumably for de-duplication; URL paths are case-sensitive,
                        // so confirm this is intended.
                        $link = strtolower($node->attribute[$attr]);
                        $n = getAbsolutePath($link, $uri['scheme'].'://'.$uri['host'], $baseDir);
                        $GLOBALS['node_tmp'][$n] = 1;
                }
        }

        if ($node->hasChildren()) {
                foreach ($node->child as $child) {
                        _callBackGetNode($child,$type,$uri);
                }
        }
}
/**
 * Download a page and return the URLs referenced by its $type elements.
 *
 * The page is fetched with file_get_contents(), parsed with the tidy
 * extension, and its <html> tree is handed to getNodes(). If the URL cannot
 * be parsed, the download fails, or tidy cannot produce a document, an empty
 * array is returned instead of calling a method on a failed result.
 *
 * @param string $url  Address of the page to fetch.
 * @param string $type Element name to collect; defaults to 'a'.
 * @return array Map whose keys are the collected URLs; empty on failure.
 */
function getHtml($url,$type='a') {
   $getUrl = parse_url($url);
   if ($getUrl === false) return array();

   $content = file_get_contents($url);
   if ($content === false) return array();

   $html = tidy_parse_string($content);
   if (!$html) return array();

   $root = $html->html();
   if ($root === null) return array();

   return getNodes($root,$type,$getUrl);
}
  1. No comments yet.
(will not be published)
  1. No trackbacks yet.