PHP统计nginx访问日志中的搜索引擎抓取404链接页面

2019-08-13 18:20 来源:未知

我在服务器上有每天切割nginx日志的习惯,所以针对每天各大搜索引擎的来访,总能记录到一些404页面信息。以往我只是偶尔分析一下日志,但是对于日志信息很多的朋友来说,人工筛选可能不是一件容易的事情。于是我自己慢慢研究了一下,针对谷歌、百度、搜搜、360搜索、宜搜、搜狗、必应等搜索引擎的404访问,分别生成一个txt文本文件,直接上代码test.php。

 

复制代码 代码如下:

<?php
/**
 * Escape HTML special characters (both quote styles) in a string, or
 * recursively in every element of an array.
 *
 * @param string|array $string Value, or (nested) array of values, to escape.
 * @return string|array Escaped data with the same keys/shape as the input.
 */
function new_htmlspecialchars($string)
{
    if (!is_array($string)) {
        return htmlspecialchars($string, ENT_QUOTES);
    }

    $escaped = array();
    foreach ($string as $index => $item) {
        $escaped[$index] = new_htmlspecialchars($item);
    }
    return $escaped;
}

// Usage: request test.php?s=google — "s" selects one key of $spiders.
//
// Scans yesterday's nginx access log for requests that returned 404 to the
// chosen crawler, writes the matching URLs to a text file (once per day and
// per crawler) and prints the public URL of that file.
// (The duplicate "<?php" opening tag that used to start this section was
// removed: PHP mode is already open above, so it was a parse error.)
$domain = '//www.jb51.net';
$spiders = array(
    'baidu'  => 'Baiduspider',
    '360'    => '360Spider',
    'google' => 'Googlebot',
    'soso'   => 'Sosospider',
    'sogou'  => 'Sogou web spider',
    'easou'  => 'EasouSpider',
    'bing'   => 'bingbot',
);

// date('d')-1 produced day "0" on the first of a month and never rolled the
// month/year back; derive every date part from one "yesterday" timestamp.
// 'j' (day without leading zero) keeps the directory/file naming that the
// previous arithmetic produced on ordinary days.
$yesterday = strtotime('-1 day');
$path = '/home/nginx/logs/' . date('Y/m/', $yesterday) . date('j', $yesterday) . '/access_www.txt';

// A missing "s" parameter is treated like an unknown crawler.
$s = isset($_GET['s']) ? (string) $_GET['s'] : '';

if (!array_key_exists($s, $spiders)) die();
$spider = $spiders[$s];

$file = $s . '_' . date('ym', $yesterday) . date('j', $yesterday) . '.txt';
if (!file_exists($file)) {
    $in = @file_get_contents($path);
    if ($in === false) {
        // No readable log for yesterday: nothing to report.
        die();
    }
    // '#' is the delimiter so the "/" in "HTTP/1.1" needs no escaping; the
    // old pattern used "/" unescaped, which PCRE rejects.
    $pattern = '#GET (.*) HTTP/1\.1" 404.*' . preg_quote($spider, '#') . '#';
    preg_match_all($pattern, $in, $matches);
    $out = '';
    foreach ($matches[1] as $v) {
        // One URL per line, CRLF-terminated (the literal "rn" was a mangled "\r\n").
        $out .= $domain . $v . "\r\n";
    }
    file_put_contents($file, $out);
}

$url = $domain . '/silian/' . $file;
echo $url;

/**
 * Apply addslashes() to a string, or recursively to every element of an array.
 *
 * @param string|array $string Value, or (nested) array of values, to escape.
 * @return string|array Escaped data with the same keys/shape as the input.
 */
function new_addslashes($string)
{
    return is_array($string)
        ? array_map('new_addslashes', $string)
        : addslashes($string);
}

好,就这样了。没有什么高深的技术,只有动手写的过程。

/**
 * Apply stripslashes() to a string, or recursively to every element of an array.
 *
 * @param string|array $string Value, or (nested) array of values, to unescape.
 * @return string|array Unescaped data with the same keys/shape as the input.
 */
function new_stripslashes($string)
{
    return is_array($string)
        ? array_map('new_stripslashes', $string)
        : stripslashes($string);
}

您可能感兴趣的文章:

/**
 * Remove tags that are not allowed and strip inline event-handler attributes
 * and "javascript:" URLs from the tags that remain.
 *
 * Strings are filtered directly; arrays are filtered element by element.
 *
 * @param string|array $string             Markup (or array of markup) to filter.
 * @param string       $allowedtags        Tags kept by strip_tags(), e.g. '<a><b>'.
 * @param array        $disabledattributes Attribute names removed from kept tags.
 * @return string|array Filtered data.
 */
function filter_xss($string, $allowedtags = '', $disabledattributes = array('onabort', 'onactivate', 'onafterprint', 'onafterupdate', 'onbeforeactivate', 'onbeforecopy', 'onbeforecut', 'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint', 'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'oncellchange', 'onchange', 'onclick', 'oncontextmenu', 'oncontrolselect', 'oncopy', 'oncut', 'ondataavaible', 'ondatasetchanged', 'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragdrop', 'ondragend', 'ondragenter', 'ondragleave', 'ondragover', 'ondragstart', 'ondrop', 'onerror', 'onerrorupdate', 'onfilterupdate', 'onfinish', 'onfocus', 'onfocusin', 'onfocusout', 'onhelp', 'onkeydown', 'onkeypress', 'onkeyup', 'onlayoutcomplete', 'onload', 'onlosecapture', 'onmousedown', 'onmouseenter', 'onmouseleave', 'onmousemove', 'onmoveout', 'onmouseover', 'onmouseup', 'onmousewheel', 'onmove', 'onmoveend', 'onmovestart', 'onpaste', 'onpropertychange', 'onreadystatechange', 'onreset', 'onresize', 'onresizeend', 'onresizestart', 'onrowexit', 'onrowsdelete', 'onrowsinserted', 'onscroll', 'onselect', 'onselectionchange', 'onselectstart', 'onstart', 'onstop', 'onsubmit', 'onunload'))
{
    if (is_array($string)) {
        // NOTE(review): nested values are filtered with the ALLOWED_HTMLTAGS
        // constant and the default attribute list, not with the arguments
        // given to this call. Kept as-is — confirm this is intended.
        foreach ($string as $key => $val) {
            $string[$key] = filter_xss($val, ALLOWED_HTMLTAGS);
        }
        return $string;
    }

    $attributes = implode('|', $disabledattributes);

    // Step 1: drop every tag that is not explicitly allowed.
    $kept = strip_tags($string, $allowedtags);

    // Step 2: clean the inside of each remaining tag. This replaces the old
    // preg_replace() call that relied on the /e (eval) modifier, which was
    // removed in PHP 7. The callback receives the raw match, so the
    // stripslashes() that undid /e's automatic escaping is no longer needed.
    $cleaned = preg_replace_callback(
        '/<(.*?)>/i',
        function ($match) use ($attributes) {
            $inner = preg_replace(
                array(
                    '/javascript:[^"\']*/i',
                    '/(' . $attributes . ')[ \t\n]*=[ \t\n]*["\'][^"\']*["\']/i',
                    '/\s+/',
                ),
                array('', '', ' '),
                $match[1]
            );
            return '<' . $inner . '>';
        },
        $kept
    );

    // Step 3: remove any disabled attribute that survived (e.g. unquoted
    // values), up to the next whitespace or the closing '>'.
    return preg_replace('/\s(' . $attributes . ').*?([\s>])/', '\2', $cleaned);
}

/**
 * Strip or neutralise characters commonly used in injection attempts.
 *
 * The replacements run in the order listed, so an earlier replacement can be
 * affected by a later one: the ';' removal also removes the semicolon of the
 * '&quot;' produced for a double quote.
 *
 * NOTE(review): the search/replace strings for '%20', '%27', '&quot;',
 * '&lt;' and '&gt;' were reconstructed — the previous text had them decoded
 * into a literal space, bare quotes and no-op replacements, two of which were
 * syntax errors. Confirm against the upstream copy of this helper.
 *
 * @param string $string Untrusted input.
 * @return string Sanitised string.
 */
function safe_replace($string)
{
    $replacements = array(
        array('%20', ''),
        array('%27', ''),
        array('*', ''),
        array('"', '&quot;'),
        array("'", ''),
        array("\"", ''),
        array('//', ''),
        array(';', ''),
        array('<', '&lt;'),
        array('>', '&gt;'),
        array('(', ''),
        array(')', ''),
        array("{", ''),
        array('}', ''),
    );
    foreach ($replacements as $pair) {
        $string = str_replace($pair[0], $pair[1], $string);
    }
    return $string;
}

/**
 * Check the current request against the configured list of forbidden words.
 *
 * The word list is read from $PHPCMS['filter_word'], one entry per line; a
 * '*' inside an entry matches any run of characters. On a hit the matched
 * text is published as the ILLEGAL_WORD constant and a row is appended to the
 * monthly CSV log under data/filterlog/.
 *
 * NOTE(review): $data is only used for the early-exit check; the text that is
 * actually scanned is always array2string($_REQUEST). Kept as-is — confirm
 * whether a caller-supplied $data should be scanned instead.
 *
 * @param mixed $data Optional data; a non-empty value forces the check even
 *                    when $_GET and $_POST are empty.
 * @return bool True when a forbidden word was found, false otherwise.
 */
function filter_word($data = '')
{
    global $PHPCMS;
    $filter_word = trim($PHPCMS['filter_word']);
    if (!$filter_word || (!$data && !$_GET && !$_POST)) return false;

    // One entry per line (the separator had been mangled into the letter "n").
    $filter_word = array_filter(array_map('trim', explode("\n", $filter_word)));
    if (!$filter_word) return false;

    // Quote each entry for use between '/' delimiters, then turn the quoted
    // wildcard '\*' back into '.*'.
    $quoted = array();
    foreach ($filter_word as $word) {
        $quoted[] = preg_quote($word, '/');
    }
    $pattern = str_replace('\*', '.*', implode('|', $quoted));

    $data = array2string($_REQUEST);
    if ($pattern && preg_match("/($pattern)/", $data, $m)) {
        $pattern_word = $m[0];
        define('ILLEGAL_WORD', $pattern_word);
        unset($m[0]);
        $word = implode(' ', $m);
        $logdata = array(TIME, IP, $word, $pattern_word);
        $logfile = PHPCMS_ROOT . 'data/filterlog/' . date('Ym', TIME) . '.csv';
        $fp = fopen($logfile, 'a');
        // A log that cannot be opened must not hide the detection itself.
        if ($fp) {
            fputcsv($fp, $logdata);
            fclose($fp);
        }
        return true;
    }
    return false;
}

/**
 * Prepare plain textarea input for display as HTML: escape special
 * characters, keep runs of spaces visible and turn line breaks into <br />.
 *
 * The space replacement had degraded into replacing ' ' with ' ' (a no-op);
 * it is restored to emit '&nbsp;'.
 *
 * @param string $string Raw text.
 * @return string HTML-safe text with preserved spacing and line breaks.
 */
function format_textarea($string)
{
    $escaped = htmlspecialchars($string);
    $spaced = str_replace(' ', '&nbsp;', $escaped);
    return nl2br($spaced);
}

/**
 * Turn a block of text into a JavaScript-safe string, optionally wrapped in
 * a document.write() call.
 *
 * Carriage returns and line feeds are removed (a raw line break would end
 * the JS string literal) and quotes/backslashes are escaped. The previous
 * version removed the letters "r" and "n" because the "\r"/"\n" escapes had
 * lost their backslashes.
 *
 * @param string $string Text to embed.
 * @param int    $isjs   Non-zero to wrap the result in document.write("...");
 * @return string JavaScript statement, or just the escaped string.
 */
function format_js($string, $isjs = 1)
{
    $string = addslashes(str_replace(array("\r", "\n"), array('', ''), $string));
    return $isjs ? 'document.write("'.$string.'");' : $string;
}

// Fallback for runtimes that lack a native file_put_contents().
if (!function_exists('file_put_contents')) {
    define('FILE_APPEND', 8);

    /**
     * Write $data to $file under an exclusive lock.
     *
     * @param string $file   Target path.
     * @param string $data   Bytes to write.
     * @param mixed  $append Any value other than '' appends instead of truncating.
     * @return int|false Result of fwrite().
     */
    function file_put_contents($file, $data, $append = '')
    {
        if ($append == '') {
            $mode = 'wb';
        } else {
            $mode = 'ab';
        }

        $handle = @fopen($file, $mode);
        if (!$handle) {
            exit("Can not open file $file !");
        }

        flock($handle, LOCK_EX);
        $written = @fwrite($handle, $data);
        flock($handle, LOCK_UN);
        @fclose($handle);

        return $written;
    }
}

// Fallback for runtimes that lack a native http_build_query().
if (!function_exists('http_build_query')) {
    /**
     * Build a URL-encoded query string from an array or object.
     *
     * @param array|object $data   Values to encode; nested arrays/objects are
     *                             flattened using bracket notation.
     * @param string|null  $prefix Prepended to integer keys of the top level.
     * @param string       $sep    Pair separator; when empty, the
     *                             arg_separator.output ini value is used.
     * @param string       $key    Internal: parent key used while recursing.
     * @return string Encoded query string.
     */
    function http_build_query($data, $prefix = null, $sep = '', $key = '')
    {
        $ret = array();
        foreach ((array) $data as $k => $v) {
            // Decide "numeric key" BEFORE urlencode(): urlencode() always
            // returns a string, so testing is_int() afterwards never matched
            // and the prefix was silently ignored.
            $isNumericKey = is_int($k);
            $k = urlencode($k);
            if ($isNumericKey && $prefix != null) {
                $k = $prefix . $k;
            }
            if (!empty($key)) {
                $k = $key . "[" . $k . "]";
            }
            if (is_array($v) || is_object($v)) {
                array_push($ret, http_build_query($v, "", $sep, $k));
            } else {
                array_push($ret, $k . "=" . urlencode($v));
            }
        }
        if (empty($sep)) {
            $sep = ini_get("arg_separator.output");
        }
        return implode($sep, $ret);
    }
}

// Fallback for runtimes that lack a native image_type_to_extension().
if (!function_exists('image_type_to_extension')) {
    /**
     * Map an IMAGETYPE_* number to its file extension.
     *
     * @param int  $type Image type number (1-based).
     * @param bool $dot  Whether to prefix the extension with a dot.
     * @return string|null Extension, or null (with a notice) for an empty or
     *                     unknown type.
     */
    function image_type_to_extension($type, $dot = true)
    {
        // Index 1 is gif; the remaining entries follow consecutively.
        $extensions = array(1 => 'gif', 'jpeg', 'png', 'swf', 'psd', 'bmp', 'tiff', 'tiff', 'jpc', 'jp2', 'jpf', 'jb2', 'swc', 'aiff', 'wbmp', 'xbm');

        $type = intval($type);
        if ($type == 0) {
            trigger_error('File Type is null...', E_USER_NOTICE);
            return null;
        }
        if (!isset($extensions[$type])) {
            trigger_error('Image type is wrong...', E_USER_NOTICE);
            return null;
        }

        $prefix = $dot ? '.' : '';
        return $prefix . $extensions[$type];
    }
}

// Fallback for runtimes that lack a native array_intersect_key().
if (!function_exists('array_intersect_key')) {
    /**
     * Return the entries of $isec whose keys exist in every other array given.
     *
     * @param array $isec Array whose entries are returned.
     * @param array $keys First array to compare keys against; further arrays
     *                    may be passed as extra arguments.
     * @return array Entries of $isec whose keys are set in all other arrays.
     */
    function array_intersect_key($isec, $keys)
    {
        $argc = func_num_args();
        if ($argc > 2) {
            // The loop increment had been lost ("$i )"), which was a parse
            // error; restored to $i++.
            for ($i = 1; !empty($isec) && $i < $argc; $i++) {
                $arr = func_get_arg($i);
                foreach (array_keys($isec) as $key) {
                    if (!isset($arr[$key])) {
                        unset($isec[$key]);
                    }
                }
            }
            return $isec;
        }

        $res = array();
        foreach (array_keys($isec) as $key) {
            if (isset($keys[$key])) {
                $res[$key] = $isec[$key];
            }
        }
        return $res;
    }
}

// Fallback for runtimes without the JSON extension: delegate to the bundled
// json class.
if (!function_exists('json_encode')) {
    /**
     * Encode a value as JSON through json.class.php.
     *
     * @param mixed $string Value to encode.
     * @return mixed Whatever json::encode() returns.
     */
    function json_encode($string)
    {
        require_once 'json.class.php';
        $encoder = new json();
        $encoded = $encoder->encode($string);
        return $encoded;
    }
}

// Fallback for runtimes without the JSON extension: delegate to the bundled
// json class.
if (!function_exists('json_decode')) {
    /**
     * Decode a JSON document through json.class.php.
     *
     * @param string $string JSON text.
     * @param int    $type   Passed through to json::decode() unchanged.
     * @return mixed Whatever json::decode() returns.
     */
    function json_decode($string, $type = 1)
    {
        require_once 'json.class.php';
        $decoder = new json();
        $decoded = $decoder->decode($string, $type);
        return $decoded;
    }
}

// Fallback for runtimes without the iconv extension.
if (!function_exists('iconv')) {
    /**
     * Convert $str between character sets.
     *
     * Uses mb_convert_encoding() when available; otherwise only the
     * UTF-8 <-> GBK/GB2312 pairs are converted (via iconv.func.php) and any
     * other pair returns the input unchanged.
     *
     * @param string $in_charset  Source charset name.
     * @param string $out_charset Target charset name.
     * @param string $str         Text to convert.
     * @return string Converted (or unchanged) text.
     */
    function iconv($in_charset, $out_charset, $str)
    {
        if (function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($str, $out_charset, $in_charset);
        }

        require_once 'iconv.func.php';
        $from = strtoupper($in_charset);
        $to = strtoupper($out_charset);
        $fromIsGb = ($from == 'GBK' || $from == 'GB2312');
        $toIsGb = ($to == 'GBK' || $to == 'GB2312');

        if ($from == 'UTF-8' && $toIsGb) {
            return utf8_to_gbk($str);
        }
        if ($fromIsGb && $to == 'UTF-8') {
            return gbk_to_utf8($str);
        }
        return $str;
    }
}

/**
 * Convert a string, or every string inside a (nested) array, from one
 * character set to another using iconv().
 *
 * @param string       $in_charset  Source charset name.
 * @param string       $out_charset Target charset name.
 * @param string|array $str_or_arr  Text or array of texts; keys are kept.
 * @return string|array Converted data with the same shape as the input.
 */
function str_charset($in_charset, $out_charset, $str_or_arr)
{
    if (!is_array($str_or_arr)) {
        return iconv($in_charset, $out_charset, $str_or_arr);
    }

    foreach (array_keys($str_or_arr) as $index) {
        $str_or_arr[$index] = str_charset($in_charset, $out_charset, $str_or_arr[$index]);
    }
    return $str_or_arr;
}

/**
 * Remove parent-directory sequences ("..") and line breaks from a string.
 *
 * The previous version removed the letters "n" and "r" because the
 * "\n"/"\r" escapes had lost their backslashes.
 *
 * @param string $str Input text.
 * @return string Text without "..", LF or CR.
 */
function stripstr($str)
{
    return str_replace(array('..', "\n", "\r"), array('', '', ''), $str);
}

// Fallback for runtimes that lack a native fputcsv().
if (!function_exists('fputcsv')) {
    /**
     * Write one row to an open file as a delimited, enclosed line.
     *
     * Each field is wrapped in $enclosure and the row ends with a line feed
     * (the terminator had been mangled into the letter "n"). Enclosure
     * characters inside field values are not escaped.
     *
     * @param resource $fp        Open, writable file handle.
     * @param array    $array     Field values.
     * @param string   $delimiter Field separator.
     * @param string   $enclosure Field enclosure.
     * @return int|false Result of fwrite().
     */
    function fputcsv(&$fp, $array, $delimiter = ',', $enclosure = '"')
    {
        $data = $enclosure.implode($enclosure.$delimiter.$enclosure, $array).$enclosure."\n";
        return fwrite($fp, $data);
    }
}

/**
 * Build a random string of $length characters drawn from $chars.
 *
 * Uses mt_rand(), so the result is NOT suitable for security-sensitive
 * tokens.
 *
 * @param int    $length Number of characters to generate.
 * @param string $chars  Alphabet to draw from (single-byte characters).
 * @return string Random string; empty when $length <= 0 or $chars is empty.
 */
function random($length, $chars = '0123456789')
{
    $hash = '';
    $max = strlen($chars) - 1;
    if ($max < 0) {
        // Empty alphabet: nothing to draw from (mt_rand(0, -1) would fail).
        return $hash;
    }
    // The loop increment had been lost ("$i )"), which was a parse error.
    for ($i = 0; $i < $length; $i++) {
        $hash .= $chars[mt_rand(0, $max)];
    }
    return $hash;
}

/**
 * Set (or delete) a cookie whose name is prefixed with COOKIE_PRE, and
 * mirror the value into $_COOKIE for the current request.
 *
 * @param string       $var   Cookie name without the prefix.
 * @param string|array $value Value; an array is sent as one cookie per
 *                            element, named "name[key]".
 * @param int          $time  Expiry timestamp. When not positive: an empty
 *                            value expires the cookie one hour in the past,
 *                            any other value uses expiry 0.
 * @return void
 */
function set_cookie($var, $value = '', $time = 0)
{
    if (!($time > 0)) {
        $time = ($value == '') ? PHP_TIME - 3600 : 0;
    }
    // Flag the cookie as secure only when served on port 443.
    $secure = ($_SERVER['SERVER_PORT'] == '443') ? 1 : 0;
    $name = COOKIE_PRE.$var;
    $_COOKIE[$name] = $value;

    if (!is_array($value)) {
        setcookie($name, $value, $time, COOKIE_PATH, COOKIE_DOMAIN, $secure);
        return;
    }
    foreach ($value as $subkey => $subvalue) {
        setcookie($name.'['.$subkey.']', $subvalue, $time, COOKIE_PATH, COOKIE_DOMAIN, $secure);
    }
}

/**
 * Read a cookie whose name is prefixed with COOKIE_PRE.
 *
 * @param string $var Cookie name without the prefix.
 * @return mixed The cookie value, or false when it is not set.
 */
function get_cookie($var)
{
    $name = COOKIE_PRE.$var;
    if (isset($_COOKIE[$name])) {
        return $_COOKIE[$name];
    }
    return false;
}

/**
 * Store one field of a content item in its text file.
 *
 * @param int|string $contentid Content id.
 * @param string     $field     Field name.
 * @param string     $data      Data to write.
 * @return int|false Result of file_put_contents(); errors are suppressed.
 */
function content_set($contentid, $field, $data)
{
    $target = content_file($contentid, $field);
    return @file_put_contents($target, $data);
}

/**
 * Read one field of a content item from its text file.
 *
 * @param int|string $contentid Content id.
 * @param string     $field     Field name.
 * @return string|false File contents, or false on failure (errors suppressed).
 */
function content_get($contentid, $field)
{
    $source = content_file($contentid, $field);
    return @file_get_contents($source);
}

/**
 * Delete the text file that stores one field of a content item.
 *
 * @param int|string $contentid Content id.
 * @param string     $field     Field name.
 * @return bool Result of unlink(); errors are suppressed.
 */
function content_del($contentid, $field)
{
    $target = content_file($contentid, $field);
    return @unlink($target);
}

/**
 * Build the path of the text file that stores one field of a content item.
 *
 * The id is left-padded with zeros to at least four characters; its first
 * two and next two characters name two levels of sub-directories:
 * CONTENT_ROOT . field/AB/CD/id.txt
 *
 * @param int|string $contentid Content id.
 * @param string     $field     Field name (top-level directory).
 * @return string File path.
 */
function content_file($contentid, $field)
{
    $padded = str_pad($contentid, 4, '0', STR_PAD_LEFT);
    $level1 = substr($padded, 0, 2);
    $level2 = substr($padded, 2, 2);
    return CONTENT_ROOT.$field.'/'.$level1.'/'.$level2.'/'.$contentid.'.txt';
}

/**
 * Pre-create the two-level directory tree used to store a content field.
 *
 * For every number 1..9999 (zero-padded to four digits "ABCD") the
 * directories CONTENT_ROOT . field/AB and field/AB/CD are created. mkdir()
 * errors, including "already exists", are suppressed.
 *
 * @param string $field Field name (top-level directory).
 * @return bool Always true.
 */
function content_init($field)
{
    @set_time_limit(300);
    @mkdir(CONTENT_ROOT.$field, 0777);
    // The loop increment had been lost ("$i )"), which was a parse error.
    for ($i = 1; $i <= 9999; $i++) {
        $id = str_pad($i, 4, '0', STR_PAD_LEFT);
        $dir1 = CONTENT_ROOT.$field.'/'.substr($id, 0, 2);
        $dir2 = $dir1.'/'.substr($id, 2, 2);
        @mkdir($dir1, 0777);
        @mkdir($dir2, 0777);
    }
    return true;
}

/**
 * Load the child menu entries of $parentid and optionally render them
 * through a template string.
 *
 * Without $code the raw rows (keyed by menuid) are returned. With $code,
 * each row's columns are extract()ed into local variables and $code is
 * evaluated as a double-quoted string for every row the current role/group
 * may see; the concatenated result is returned.
 *
 * SECURITY NOTE(review): $code is passed to eval() and row columns are
 * extract()ed into the local scope. Both are only safe if $code and the menu
 * table are fully trusted — confirm with callers. $parentid and $_userid are
 * interpolated into the SQL text unescaped.
 *
 * @param int|string $parentid Parent menu id; 99 restricts to the current user.
 * @param string     $code     Optional template; may reference row columns as variables.
 * @return array|string Rows when $code is empty, rendered text otherwise.
 */
function menu($parentid, $code = '')
{
    global $db, $_userid, $_roleid, $_groupid;
    // Escape double quotes so the template can be embedded in the
    // double-quoted string that is eval()ed below (the escape had been lost,
    // leaving a no-op replacement).
    $code = str_replace('"', '\"', $code);
    $where = $parentid == 99 ? "AND userid=$_userid" : '';
    $menus = $db->select("SELECT * FROM `".DB_PRE."menu` WHERE `parentid`='$parentid' $where ORDER BY `listorder`,`menuid`", 'menuid');
    if ($code) {
        // Start from an empty string so ".=" never reads an undefined variable.
        $menu = '';
        foreach ($menus as $m) {
            extract($m);
            if (($roleids && defined('IN_ADMIN') && !check_in($_roleid, $roleids)) || ($groupids && !defined('IN_ADMIN') && !check_in($_groupid, $groupids))) continue;
            // Restored escapes: append the interpolated template to $menu.
            eval("\$menu .= \"$code\";");
        }
        $menus = $menu;
    }
    return $menus;
}

/**
 * Normalise a site-relative URL.
 *
 * URLs that contain "://" or start with "?" are returned untouched.
 * Otherwise the result is absolute (prefixed with SITE_URL) when $isabs is
 * set or the SHOWJS constant is defined, and rooted at PHPCMS_PATH if not.
 * A leading PHPCMS_PATH is never duplicated.
 *
 * @param string $url   URL to normalise.
 * @param int    $isabs Non-zero to force an absolute URL.
 * @return string Normalised URL.
 */
function url($url, $isabs = 0)
{
    if (strpos($url, '://') !== FALSE || $url[0] == '?') {
        return $url;
    }

    $hasBasePath = (strpos($url, PHPCMS_PATH) === 0);
    $absolute = ($isabs || defined('SHOWJS'));

    if ($absolute) {
        if ($hasBasePath) {
            return SITE_URL.substr($url, strlen(PHPCMS_PATH));
        }
        return SITE_URL.$url;
    }

    if ($hasBasePath) {
        return $url;
    }
    return PHPCMS_PATH.$url;
}

/**
 * Tell whether the current user agent identifies itself as Internet Explorer.
 *
 * Agents containing "opera" or "konqueror" are rejected first, then "msie "
 * is looked for; the comparison is case-insensitive.
 *
 * @return bool True for an IE user agent.
 */
function is_ie()
{
    $agent = strtolower($_SERVER['HTTP_USER_AGENT']);
    foreach (array('opera', 'konqueror') as $other) {
        if (strpos($agent, $other) !== false) {
            return false;
        }
    }
    return strpos($agent, 'msie ') !== false;
}

/**
 * Tell whether the current request comes from a search-engine crawler.
 *
 * The answer is computed once and stored in the IS_WEBSEARCH constant. The
 * raw User-Agent header is first tested against a list of browser names
 * (a hit means "not a crawler"), then against a list of crawler names.
 *
 * NOTE(review): both matches are case-sensitive and use the raw header; the
 * lower-cased copy in $useragent is never read. Matching on the lower-cased
 * copy would make "Mozilla/5.0 (compatible; Googlebot/...)" hit the browser
 * list, so the current behaviour is preserved — confirm the intent.
 *
 * @return bool True when the user agent looks like a crawler.
 */
function is_websearch()
{
    if (defined('IS_WEBSEARCH')) {
        return IS_WEBSEARCH;
    }

    $useragent = strtolower($_SERVER['HTTP_USER_AGENT']);
    $browsers = 'msie|netscape|opera|konqueror|mozilla';
    $spiders = 'bot|spider|google|isaac|surveybot|baiduspider|yahoo|sohu-search|yisou|3721|qihoo|daqi|ia_archiver|p.arthur|fast-webcrawler|java|microsoft-atl-native|turnitinbot|webgather|sleipnir|msn';

    $looksLikeSpider = false;
    if (!preg_match("/($browsers)/", $_SERVER['HTTP_USER_AGENT'])) {
        if (preg_match("/($spiders)/", $_SERVER['HTTP_USER_AGENT'])) {
            $looksLikeSpider = true;
        }
    }

    if ($looksLikeSpider) {
        define('IS_WEBSEARCH', TRUE);
    } else {
        define('IS_WEBSEARCH', FALSE);
    }
    return IS_WEBSEARCH;
}

/**
 * Validate a "year<sep>month<sep>day" date string.
 *
 * Malformed input (fewer than three parts, non-numeric parts, empty
 * separator) now returns FALSE instead of raising notices or a TypeError
 * inside checkdate().
 *
 * @param string $ymd Date text, e.g. "2019-08-13".
 * @param string $sep Separator between the parts.
 * @return bool TRUE when the parts form a valid Gregorian date.
 */
function is_date($ymd, $sep = '-')
{
    if (empty($ymd) || $sep === '') return FALSE;

    $parts = explode($sep, $ymd);
    if (count($parts) < 3) return FALSE;

    // Only the first three parts are significant, as before.
    list($year, $month, $day) = $parts;
    if (!is_numeric($year) || !is_numeric($month) || !is_numeric($day)) {
        return FALSE;
    }
    return checkdate((int) $month, (int) $day, (int) $year);
}

/**
 * Loosely validate an e-mail address.
 *
 * The address must be longer than six characters and look like
 * "local@domain.tld", where local and domain consist of word characters,
 * '-' and '.'. The pattern had lost its backslashes and '+' quantifiers
 * ("[w-.] @[w-.] (.w ) $"), so it could not match real addresses.
 *
 * @param string $email Address to check.
 * @return bool True when the address has a plausible shape.
 */
function is_email($email)
{
    return strlen($email) > 6 && preg_match('/^[\w\-\.]+@[\w\-\.]+(\.\w+)+$/', $email);
}

/**
 * Tell whether $needle occurs anywhere inside $haystack.
 *
 * @param string $haystack Text to search in.
 * @param string $needle   Text to search for.
 * @return bool True when $needle is found.
 */
function str_exists($haystack, $needle)
{
    $position = strpos($haystack, $needle);
    return $position !== FALSE;
}

/**
 * Send a file to the client as a download and end the script.
 *
 * Any buffered output is discarded first. For Internet Explorer the
 * download name is raw-URL-encoded. The Content-type header carries the
 * file extension as returned by fileext().
 *
 * @param string $filepath Path of the file to send.
 * @param string $filename Name offered to the client; defaults to the
 *                         basename of $filepath.
 * @return void Never returns; calls exit.
 */
function file_down($filepath, $filename = '')
{
    if (!$filename) {
        $filename = basename($filepath);
    }
    if (is_ie()) {
        $filename = rawurlencode($filename);
    }
    $filetype = fileext($filename);
    $filesize = sprintf("%u", filesize($filepath));

    if (ob_get_length() !== false) {
        @ob_end_clean();
    }

    // Sent in this exact order; the second Cache-Control replaces the first.
    $headers = array(
        'Pragma: public',
        'Last-Modified: '.gmdate('D, d M Y H:i:s') . ' GMT',
        'Cache-Control: no-store, no-cache, must-revalidate',
        'Cache-Control: pre-check=0, post-check=0, max-age=0',
        'Content-Transfer-Encoding: binary',
        'Content-Encoding: none',
        'Content-type: '.$filetype,
        'Content-Disposition: attachment; filename="'.$filename.'"',
        'Content-length: '.$filesize,
    );
    foreach ($headers as $line) {
        header($line);
    }

    readfile($filepath);
    exit;
}

/**
 * Return the lower-cased extension of a file name (text after the last dot,
 * at most ten characters, trimmed).
 *
 * @param string $filename File name or path.
 * @return string Extension without the dot; empty when there is no dot.
 */
function fileext($filename)
{
    $fromLastDot = strrchr($filename, '.');
    $extension = substr($fromLastDot, 1, 10);
    return strtolower(trim($extension));
}

/**
 * Join a list of ids into a separator-delimited string.
 *
 * @param array|string $array Ids; a non-array value is returned unchanged.
 * @param string       $s     Separator.
 * @return string Joined ids, '' for empty input, or the scalar passed in.
 */
function implodeids($array, $s = ',')
{
    if (empty($array)) {
        return '';
    }
    if (!is_array($array)) {
        return $array;
    }
    return implode($s, $array);
}

/**
 * Tell whether a form was submitted from this site.
 *
 * The global named $var must be non-empty. A request without a Referer
 * header is accepted. Otherwise DOMAIN must appear in the referer right
 * after the scheme.
 *
 * The previous check required DOMAIN at offset 7 exactly, i.e. only after
 * "http://"; referers using "https://" (offset 8) were always rejected.
 *
 * @param string $var Name of the global variable that marks the submission.
 * @return bool True when the submission is accepted.
 */
function check_submit($var)
{
    if (empty($GLOBALS[$var])) return false;
    if (empty($_SERVER['HTTP_REFERER'])) return true;

    $referer = $_SERVER['HTTP_REFERER'];
    $position = strpos($referer, DOMAIN);
    if ($position === 7) {
        // Unchanged behaviour: DOMAIN directly after a 7-character scheme prefix.
        return true;
    }
    // New: also accept DOMAIN directly after "https://".
    return $position === 8 && strncasecmp($referer, 'https://', 8) === 0;
}

/**
 * Test membership of one id, or several ids, in a delimited id list.
 *
 * @param mixed  $id  A single id, or an array of ids.
 * @param string $ids Delimited list of ids.
 * @param string $s   Delimiter used in $ids.
 * @return bool|array For a single id: whether it is in the list. For an
 *                    array: the ids that are in the list (array_intersect()
 *                    result). False when $ids is empty.
 */
function check_in($id, $ids = '', $s = ',')
{
    if (!$ids) {
        return false;
    }
    $list = explode($s, $ids);
    if (is_array($id)) {
        return array_intersect($id, $list);
    }
    return in_array($id, $list);
}

/**
 * Determine the client's IP address.
 *
 * Sources are tried in this order and the first non-empty value other than
 * "unknown" wins: the HTTP_CLIENT_IP, HTTP_X_FORWARDED_FOR and REMOTE_ADDR
 * environment variables, then $_SERVER['REMOTE_ADDR']. The first run of
 * 7-15 digits/dots in that value is returned.
 *
 * Fixes: unrelated text had been pasted into the 'REMOTE_ADDR' getenv() key
 * so that branch could never match; the regex had lost the backslash of
 * "\d"; and $ip was undefined when no source matched.
 *
 * NOTE: the two HTTP_* values come from request headers and can be forged by
 * the client.
 *
 * @return string Dotted address, or 'unknown'.
 */
function ip()
{
    $ip = '';
    $found = false;
    foreach (array('HTTP_CLIENT_IP', 'HTTP_X_FORWARDED_FOR', 'REMOTE_ADDR') as $name) {
        $candidate = getenv($name);
        // strcasecmp() returns 0 for "unknown", i.e. the value is skipped.
        if ($candidate && strcasecmp($candidate, 'unknown')) {
            $ip = $candidate;
            $found = true;
            break;
        }
    }
    if (!$found && isset($_SERVER['REMOTE_ADDR']) && $_SERVER['REMOTE_ADDR'] && strcasecmp($_SERVER['REMOTE_ADDR'], 'unknown')) {
        $ip = $_SERVER['REMOTE_ADDR'];
    }
    return preg_match('/[\d.]{7,15}/', $ip, $matches) ? $matches[0] : 'unknown';
}

/**
 * Tell whether an IP address is currently banned.
 *
 * The ban list is read from the 'ipbanned.php' cache as mask => expiry
 * timestamp. Entries whose expiry is before TIME are ignored. A mask matches
 * when it equals $ip, or — if it contains '*' — when every '*' stands for
 * one to three digits.
 *
 * Fixes: the wildcard test matched $ip against the expiry timestamp ($v)
 * instead of the mask; dots in the mask were not escaped; and the pattern
 * was unanchored, so "1.2.3.*" also matched "11.2.3.4". A mask that starts
 * with '*' is now honoured as well.
 *
 * @param string $ip Address to test.
 * @return bool True when the address is banned.
 */
function ip_banned($ip)
{
    $ips = cache_read('ipbanned.php');
    if (!$ips) return false;
    foreach ($ips as $mask => $expires) {
        if ($expires < TIME) continue;
        if ($ip == $mask) return true;
        if (strpos($mask, '*') !== false) {
            $pattern = str_replace(array('.', '*'), array('\.', '[0-9]{1,3}'), $mask);
            if (preg_match('/^' . $pattern . '$/', $ip)) return true;
        }
    }
    return false;
}

/**
 * Truncate a string to roughly $length display columns and append $dot.
 *
 * Strings whose byte length is already within $length are returned
 * unchanged. Otherwise a fixed set of HTML entities is decoded, the text is
 * cut, and & " ' < > are re-encoded before $dot is appended.
 *
 * With CHARSET 'utf-8' an ASCII character counts as one column and any
 * multi-byte character as two; a character is never split. For other
 * charsets the cut is byte-based, keeping bytes > 127 together with the
 * byte that follows.
 *
 * Fixes: the entity tables had been decoded into plain characters (a parse
 * error and no-op replacements); the "++" / "+=" operators had been lost;
 * the length used by the loops is recomputed after entity decoding so the
 * loops cannot read past the end of the shortened string; lead byte 239
 * (0xEF) is now treated as a 3-byte lead like 224-238.
 *
 * @param string $string Text to shorten.
 * @param int    $length Maximum width.
 * @param string $dot    Suffix appended to a truncated string.
 * @return string Original string, or truncated text followed by $dot.
 */
function str_cut($string, $length, $dot = '...')
{
    $strlen = strlen($string);
    if ($strlen <= $length) return $string;

    $string = str_replace(
        array('&nbsp;', '&amp;', '&quot;', '&#039;', '&ldquo;', '&rdquo;', '&mdash;', '&lt;', '&gt;', '&middot;', '&hellip;'),
        array(' ', '&', '"', "'", '“', '”', '—', '<', '>', '·', '…'),
        $string
    );
    // Decoding entities may have shortened the string.
    $strlen = strlen($string);

    $strcut = '';
    if (strtolower(CHARSET) == 'utf-8') {
        // $n: byte offset, $tn: byte width of the last character, $noc: columns used.
        $n = $tn = $noc = 0;
        while ($n < $strlen) {
            $t = ord($string[$n]);
            if ($t == 9 || $t == 10 || (32 <= $t && $t <= 126)) {
                $tn = 1; $n++; $noc++;
            } elseif (194 <= $t && $t <= 223) {
                $tn = 2; $n += 2; $noc += 2;
            } elseif (224 <= $t && $t <= 239) {
                $tn = 3; $n += 3; $noc += 2;
            } elseif (240 <= $t && $t <= 247) {
                $tn = 4; $n += 4; $noc += 2;
            } elseif (248 <= $t && $t <= 251) {
                $tn = 5; $n += 5; $noc += 2;
            } elseif ($t == 252 || $t == 253) {
                $tn = 6; $n += 6; $noc += 2;
            } else {
                // Control or stray continuation byte: skip without counting.
                $n++;
            }
            if ($noc >= $length) break;
        }
        // The last wide character overshot the limit: drop it.
        if ($noc > $length) $n -= $tn;
        $strcut = substr($string, 0, $n);
    } else {
        $dotlen = strlen($dot);
        $maxi = $length - $dotlen - 1;
        for ($i = 0; $i < $maxi && $i < $strlen; $i++) {
            if (ord($string[$i]) > 127 && $i + 1 < $strlen) {
                // Keep both bytes of a double-byte character together.
                $strcut .= $string[$i] . $string[++$i];
            } else {
                $strcut .= $string[$i];
            }
        }
    }

    $strcut = str_replace(
        array('&', '"', "'", '<', '>'),
        array('&amp;', '&quot;', '&#039;', '&lt;', '&gt;'),
        $strcut
    );
    return $strcut . $dot;
}

/**
 * Serve the current page from the page cache when a fresh copy exists.
 *
 * Defines CACHE_PAGE_ID, CACHE_PAGE_DIR and CACHE_PAGE_FILE from
 * RELATE_URL, then reads the cache file. The number found at byte offset 15
 * of the file is compared with TIME; while it is greater, everything from
 * byte offset 29 on is echoed and the script exits.
 *
 * @return bool True when no fresh cached copy was served.
 */
function cache_page_start()
{
    define('CACHE_PAGE_ID', md5(RELATE_URL));
    define('CACHE_PAGE_DIR', CACHE_PAGE_PATH.substr(CACHE_PAGE_ID, 0, 2).'/');
    define('CACHE_PAGE_FILE', CACHE_PAGE_DIR.CACHE_PAGE_ID.'.html');

    $cached = @file_get_contents(CACHE_PAGE_FILE);
    if (!$cached) {
        return true;
    }

    $expires = intval(substr($cached, 15, 25));
    if ($expires > TIME) {
        echo substr($cached, 29);
        exit;
    }
    return true;
}

/**
 * Store the current output buffer in the page cache.
 *
 * The file starts with an "<!--expiretime:TIMESTAMP-->" header line followed
 * by the page contents. Nothing is written when $ttl is 0 or
 * CACHE_PAGE_FILE is not defined.
 *
 * Fixes: the expiry expression had lost its "+" ("TIME $ttl", a parse error)
 * and the header's line feed had been mangled into the letter "n".
 *
 * @param int $ttl  Lifetime in seconds, added to TIME.
 * @param int $isjs Non-zero to convert the contents with format_js() first.
 * @return false|null False when caching is skipped, otherwise nothing.
 */
function cache_page($ttl = CACHE_PAGE_TTL, $isjs = 0)
{
    if ($ttl == 0 || !defined('CACHE_PAGE_FILE')) return false;
    $contents = ob_get_contents();
    if ($isjs) $contents = format_js($contents);
    dir_create(CACHE_PAGE_DIR);
    $contents = "<!--expiretime:".(TIME + $ttl)."-->\n".$contents;
    file_put_contents(CACHE_PAGE_FILE, $contents);
    @chmod(CACHE_PAGE_FILE, 0777);
}

/**
 * Empty the page cache: delete every file inside each entry directly under
 * CACHE_PAGE_PATH, then remove that entry as a directory. Errors are
 * suppressed.
 *
 * @return void
 */
function cache_page_clear()
{
    @set_time_limit(600);
    $buckets = glob(CACHE_PAGE_PATH.'*');
    foreach ($buckets as $bucket) {
        $entries = glob($bucket.'/*');
        foreach ($entries as $entry) {
            @unlink($entry);
        }
        @rmdir($bucket);
    }
}

/**
 * Return the "count" column of a counting query, with two cache levels.
 *
 * Results are memoised per request in $TEMP['count'], keyed by md5($sql).
 * When CACHE_COUNT_TTL is non-zero, the cache_count table is consulted
 * first; the query is only executed (and the table row replaced) when no
 * row exists or its updatetime is older than TIME - CACHE_COUNT_TTL.
 *
 * @param string $sql Query whose result row contains a `count` column.
 * @return mixed The count value.
 */
function cache_count($sql)
{
    global $db, $TEMP;
    $id = md5($sql);

    if (isset($TEMP['count'][$id])) {
        return $TEMP['count'][$id];
    }

    if (!CACHE_COUNT_TTL) {
        $r = $db->get_one($sql);
    } else {
        $r = $db->get_one("SELECT `count`,`updatetime` FROM `".DB_PRE."cache_count` WHERE `id`='$id'");
        $stale = !$r || $r['updatetime'] < TIME - CACHE_COUNT_TTL;
        if ($stale) {
            // Missing or expired: run the real query and refresh the table.
            $r = $db->get_one($sql);
            $TEMP['count'][$id] = $r['count'];
            $db->query("REPLACE INTO `".DB_PRE."cache_count`(`id`, `count`, `updatetime`) VALUES('$id', '".$r['count']."', '".TIME."')");
        }
    }

    $TEMP['count'][$id] = $r['count'];
    return $TEMP['count'][$id];
}

/**
 * Fill the member_cache table from the member table when it is empty.
 *
 * @return bool True when the copy was performed, false when member_cache
 *              already reported rows.
 */
function cache_member()
{
    global $db;
    $status = $db->table_status(DB_PRE.'member_cache');
    if ($status['Rows'] != 0) {
        return false;
    }

    @set_time_limit(600);
    $db->query("INSERT INTO `".DB_PRE."member_cache` SELECT * FROM `".DB_PRE."member`");
    return true;
}

/**
 * Load a PHP cache file and return the value it returns.
 *
 * @param string $file       Cache file name.
 * @param string $path       Directory (with trailing separator); defaults to
 *                           CACHE_PATH when empty.
 * @param int    $iscachevar Non-zero to memoise the value in the global
 *                           $TEMP array under 'cache_' plus the file name
 *                           without its last four characters.
 * @return mixed Value returned by the included file; false when the include
 *               fails (the warning is suppressed).
 */
function cache_read($file, $path = '', $iscachevar = 0)
{
    if (!$path) {
        $path = CACHE_PATH;
    }
    $cachefile = $path.$file;

    if (!$iscachevar) {
        return @include $cachefile;
    }

    global $TEMP;
    $slot = 'cache_'.substr($file, 0, -4);
    if (isset($TEMP[$slot])) {
        return $TEMP[$slot];
    }
    $TEMP[$slot] = @include $cachefile;
    return $TEMP[$slot];
}

/**
 * Write an array to a PHP cache file that returns it when included.
 *
 * The file is made world-writable afterwards (chmod errors are suppressed).
 *
 * Fix: the line breaks inside the generated code had been mangled into the
 * letter "n", producing "<?phpnreturn ...;n?>", which is not valid PHP and
 * could not be read back.
 *
 * @param string $file  Cache file name.
 * @param array  $array Data to store.
 * @param string $path  Directory (with trailing separator); defaults to
 *                      CACHE_PATH when empty.
 * @return int|false Bytes written; false when $array is not an array or the
 *                   write fails.
 */
function cache_write($file, $array, $path = '')
{
    if (!is_array($array)) return false;
    $array = "<?php\nreturn ".var_export($array, true).";\n?>";
    $cachefile = ($path ? $path : CACHE_PATH).$file;
    $strlen = file_put_contents($cachefile, $array);
    @chmod($cachefile, 0777);
    return $strlen;
}

/**
 * Delete a cache file.
 *
 * @param string $file Cache file name.
 * @param string $path Directory (with trailing separator); defaults to
 *                     CACHE_PATH when empty.
 * @return bool Result of unlink(); errors are suppressed.
 */
function cache_delete($file, $path = '')
{
    if ($path) {
        $directory = $path;
    } else {
        $directory = CACHE_PATH;
    }
    return @unlink($directory.$file);
}

/**
 * Save a settings array into the `setting` column of a table.
 *
 * The array is stripped of slashes, exported with var_export() and escaped
 * with addslashes() before being placed in the UPDATE statement.
 *
 * NOTE(review): $tablename and $where are inserted into the SQL text as
 * given — callers must pass trusted values.
 *
 * @param string $tablename Table to update.
 * @param string $where     SQL condition selecting the row(s).
 * @param array  $setting   Settings to store.
 * @return mixed False when $setting is not an array, otherwise the result of
 *               $db->query().
 */
function setting_set($tablename, $where, $setting)
{
    global $db;
    if (!is_array($setting)) {
        return false;
    }
    $exported = var_export(new_stripslashes($setting), TRUE);
    $setting = addslashes($exported);
    return $db->query("UPDATE `$tablename` SET `setting`='$setting' WHERE $where");
}

TAG标签:
版权声明:本文由美洲杯赌球发布于计算机教程,转载请注明出处:PHP统计nginx访问日志中的搜索引擎抓取404链接页面