Digg StumbleUpon LinkedIn YouTube Flickr Facebook Twitter RSS Reset

php 版蜘蛛爬行记录 存储为json格式

写入记录,写入的时候也会做一个判断,如果日志文件大于5M就会自动删除一次文件

$ServerName = $_SERVER["SERVER_NAME"] ;     // host name of the server being visited
$ServerPort = $_SERVER["SERVER_PORT"] ;     // port of the server being visited
$ScriptName = $_SERVER["SCRIPT_NAME"] ;     // script being visited
$QueryString = $_SERVER["QUERY_STRING"]  ;  // raw query string
$robotsip = $_SERVER["REMOTE_ADDR"] ;       // client (crawler) IP address

$GetLocationURL=curPageURL(); // full requested URL via curPageURL()
$agent1 = $_SERVER["HTTP_USER_AGENT"] ;  // raw browser / crawler user-agent string
$agent=strtolower($agent1);    // lowercase once so every comparison below is case-insensitive

// Map of lowercase UA tokens to display names.
// NOTE: keys must be lowercase because $agent was lowercased above —
// the original mixed-case "Mediapartners-Google" case could never match.
// Alexa's crawler identifies as "ia_archiver" (with underscore).
$spiderNames = array(
	'googlebot'            => 'Google',
	'mediapartners-google' => 'Adsense',
	'baiduspider'          => '百度',
	'yahoo'                => '雅虎',
	'msnbot'               => 'MSN',
	'ia_archiver'          => 'Alexa',
	'sogou'                => '搜狗',
	'sosospider'           => '搜搜',
	'sohu'                 => '搜狐',
	'yodao'                => '有道',
);

$robots = '';  // initialize so the later test never reads an undefined variable
if (preg_match('/(googlebot|mediapartners-google|baiduspider|yahoo|msnbot|ia_archiver|sogou|sosospider|sohu|yodao)/', $agent, $matches)) {
	// Fall back to '123' (original sentinel) for an unmapped token.
	$robots = isset($spiderNames[$matches[0]]) ? $spiderNames[$matches[0]] : '123';
}

if ($robots !== '') {
	$filename = "data/spider_agent.txt";
	// Truncate the log once it exceeds 5MB.
	// is_file() guards the filesize() warning on the first ever request;
	// comparing raw bytes (not floor(MB) > 5) enforces the 5MB limit exactly.
	if (is_file($filename) && filesize($filename) > 5 * 1024 * 1024) {
		@unlink($filename);
	}
	$insertsql = array(
		'spider_name' => $robots,
		'spider_ip'   => $robotsip,
		'url'         => $GetLocationURL,
		'time'        => time(),
	);
	// Each record is one JSON object followed by a comma; the reader
	// wraps the whole file in {"agent":[...]} and strips the last comma.
	$contents = json_encode($insertsql) . ",";
	$file = fopen($filename, "ab");  // append, binary-safe
	if ($file !== false) {
		fwrite($file, $contents);
		fclose($file);
	}
}

/**
 * Rebuild the full URL of the current request from $_SERVER.
 *
 * Fixes over the original:
 *  - Uses REQUEST_URI in every branch. The old non-80 branch used
 *    REDIRECT_URL, which is only populated on Apache internal redirects
 *    and never contains the query string.
 *  - Omits the port when it is the default for the scheme (80 for http,
 *    443 for https), so standard HTTPS requests no longer come out as
 *    "https://host:443/...".
 *
 * @return string e.g. "https://example.com/path?x=1" or "http://example.com:8080/path"
 */
function curPageURL() {
	$isHttps = isset($_SERVER["HTTPS"]) && $_SERVER["HTTPS"] == "on";
	$scheme = $isHttps ? "https" : "http";
	$host = $_SERVER["SERVER_NAME"];
	$port = isset($_SERVER["SERVER_PORT"]) ? (string)$_SERVER["SERVER_PORT"] : "";
	$uri = isset($_SERVER["REQUEST_URI"]) ? $_SERVER["REQUEST_URI"] : "";

	// Only append ":port" when it differs from the scheme's default.
	$defaultPort = $isHttps ? "443" : "80";
	$portPart = ($port !== "" && $port !== $defaultPort) ? ":" . $port : "";

	return $scheme . "://" . $host . $portPart . $uri;
}

取出记录:

套用到输出时可以使用array_slice()函数进行分页显示,

// Read the spider log back. The file holds comma-terminated JSON objects
// ({"..."},{"..."},...,) so we wrap them in {"agent":[...]} and strip the
// trailing comma before decoding.
// is_file() guards the file_get_contents() warning (and the resulting
// rtrim(false, ...) call) when no spider has been logged yet — in that
// case the result is a valid empty list: array('agent' => array()).
$logPath = "data/spider_agent.txt";
$file = is_file($logPath) ? file_get_contents($logPath) : '';
$agent = '{"agent":[';
$agent .= rtrim((string)$file, ',');
$agent .= "]}";
$content = json_decode($agent, true);  // true → associative arrays, ready for array_slice() paging

有什么问题留言吧!!!

No comments yet.

Leave a Comment

You must be logged in to post a comment.