/**
 * Turn a link found in a page into an absolute URL string.
 *
 * Links that already start with http:// or https:// (any letter case) are
 * returned unchanged. Any other value is normalised: both '/' and '\' are
 * treated as separators, empty and '.' segments are dropped, and '..' removes
 * the preceding segment of $path (it never climbs into $prefix). The result is
 * $host, then $prefix (only for relative paths), then the normalised path.
 *
 * NOTE(review): segments are joined with DIRECTORY_SEPARATOR, so the result
 * only uses '/' on platforms where that constant is '/' — confirm this is
 * acceptable where the code runs.
 *
 * @param string $path   Link to resolve.
 * @param string $host   Leading part of the result, e.g. 'http://example.com'.
 * @param string $prefix Placed between $host and a relative $path; ignored
 *                       when $path starts with a separator (root-relative).
 * @return string
 */
function getAbsolutePath($path, $host, $prefix = '') {
    $path = (string) $path;

    // Already absolute: hand it back untouched.
    if (preg_match('#^https?://#i', $path)) {
        return $path;
    }

    $path = str_replace(array('/', '\\'), DIRECTORY_SEPARATOR, $path);

    // 'strlen' as the filter keeps a segment such as '0' while dropping empty ones.
    $parts = array_filter(explode(DIRECTORY_SEPARATOR, $path), 'strlen');

    $absolutes = array();
    foreach ($parts as $part) {
        if ($part === '.') {
            continue;
        }
        if ($part === '..') {
            array_pop($absolutes);
        } else {
            $absolutes[] = $part;
        }
    }

    // Keep a trailing separator, except for a lone separator, which would
    // otherwise come back doubled.
    $end = '';
    if (strlen($path) > 1 && substr($path, -1) == DIRECTORY_SEPARATOR) {
        $end = DIRECTORY_SEPARATOR;
    }

    // A path that starts with a separator is relative to the site root, so
    // the prefix must not be applied to it.
    if ($path !== '' && substr($path, 0, 1) == DIRECTORY_SEPARATOR) {
        $prefix = '';
    }

    return $host . $prefix . DIRECTORY_SEPARATOR . implode(DIRECTORY_SEPARATOR, $absolutes) . $end;
}
/**
 * Collect the link targets of every $type element below $node.
 *
 * The recursive walker _callBackGetNode() accumulates its results in
 * $GLOBALS['node_tmp']; this function resets that slot, runs the walk, copies
 * the result out and removes the global again.
 *
 * $type used to be declared with a default of 'a', but because the required
 * $uri follows it the default could never be used, and PHP 8 reports such a
 * signature as deprecated. All three arguments are required, as before.
 *
 * @param object $node Root node to walk.
 * @param string $type Element name to look for.
 * @param array  $uri  URL parts handed through to _callBackGetNode().
 * @return array Whatever the walk stored in $GLOBALS['node_tmp'].
 */
function getNodes($node, $type, $uri)
{
    $GLOBALS['node_tmp'] = array();

    _callBackGetNode($node, $type, $uri);

    $collected = $GLOBALS['node_tmp'];
    unset($GLOBALS['node_tmp']);

    return $collected;
}
/**
 * Recursively walk a node tree and record the link target of every element
 * whose name equals $type.
 *
 * For $type 'a' the href attribute is read, for any other type the src
 * attribute. The value is lower-cased, resolved with getAbsolutePath() and
 * stored as a key of $GLOBALS['node_tmp'] (value 1), so duplicates collapse.
 * Matching elements that do not carry the attribute are skipped instead of
 * producing a meaningless entry.
 *
 * NOTE(review): lower-casing the whole link also lower-cases its path and
 * query, which may matter on case-sensitive servers — kept as in the original.
 *
 * @param object $node Node exposing ->name, ->attribute, ->child and
 *                     hasChildren() (presumably a tidyNode — confirm).
 * @param string $type Element name to match.
 * @param array  $uri  Must contain 'scheme' and 'host'; 'path' is optional.
 *                     (Looks like parse_url() output — confirm with callers.)
 * @return void
 */
function _callBackGetNode($node, $type, $uri) {
    // Nothing to inspect or descend into when we were not given a node.
    if (!is_object($node)) {
        return;
    }

    if ($node->name == $type) {
        $attr = ($type == 'a') ? 'href' : 'src';

        // isset() is also false when the node has no attributes at all.
        if (isset($node->attribute[$attr])) {
            $link = strtolower($node->attribute[$attr]);
            // A URL without a path component has no 'path' key.
            $prefix = isset($uri['path']) ? $uri['path'] : '';
            $link = getAbsolutePath($link, $uri['scheme'] . '://' . $uri['host'], $prefix);
            $GLOBALS['node_tmp'][$link] = 1;
        }
    }

    if ($node->hasChildren()) {
        foreach ($node->child as $child) {
            _callBackGetNode($child, $type, $uri);
        }
    }
}
/**
 * Download a page and return the link targets of its $type elements.
 *
 * The page is fetched with file_get_contents(), parsed with the tidy
 * extension, and its <html> node is handed to getNodes() together with the
 * parse_url() breakdown of $url.
 *
 * An empty array is returned when the URL cannot be parsed, the download
 * fails, or tidy cannot produce a document, instead of calling a method on
 * false/null.
 *
 * @param string $url  Address of the page to fetch.
 * @param string $type Element name to collect; defaults to 'a'.
 * @return array Result of getNodes(), or an empty array on failure.
 */
function getHtml($url, $type = 'a') {
    $getUrl = parse_url($url);
    if ($getUrl === false) {
        return array();
    }

    $content = file_get_contents($url);
    if ($content === false) {
        return array();
    }

    $html = tidy_parse_string($content);
    if (!$html) {
        return array();
    }

    $root = $html->html();
    if ($root === null) {
        return array();
    }

    return getNodes($root, $type, $getUrl);
}
php网页抓取+分析
- No comments yet.
- No trackbacks yet.
Recent Comments