PHP 根据 User-Agent 识别搜索引擎爬虫蜘蛛

云游道人 云游道人 2025-08-01 26 阅读 0 评论

直接上代码:

/**
 * Identify a search-engine / SEO crawler from a User-Agent string.
 *
 * Rules are evaluated top to bottom and the first rule with a pattern that
 * occurs in the User-Agent (case-insensitive substring match) wins. Because
 * of that, a more specific pattern MUST be listed before any shorter pattern
 * it contains (e.g. "msnbot-media" before "msnbot"); otherwise the specific
 * rule can never fire.
 *
 * @param mixed $tmp User-Agent header value; null / non-string values are
 *                   cast to string before matching.
 *
 * @return string Human-readable crawler label, or '' when nothing matches.
 */
function isbot($tmp) {
    // Cast so a missing header (null) does not reach stripos(), which is
    // deprecated for null haystacks since PHP 8.1.
    $userAgent = (string) $tmp;
    if ($userAgent === '') {
        return '';
    }

    // Ordered rule set: specific patterns first, generic fallbacks last.
    $rules = [
        // Google
        ['patterns' => ['compatible; Googlebot/2.1'], 'label' => '谷歌蜘蛛'],
        ['patterns' => ['Googlebot-Mobile'], 'label' => '谷歌蜘蛛'],
        ['patterns' => ['Googlebot-Image'], 'label' => '谷歌图片蜘蛛'],
        ['patterns' => ['Mediapartners-Google'], 'label' => '谷歌广告蜘蛛'],
        ['patterns' => ['Adsbot-Google'], 'label' => '谷歌质量蜘蛛'],
        ['patterns' => ['Googlebot'], 'label' => '谷歌蜘蛛'],
        ['patterns' => ['GoogleOther'], 'label' => '谷歌蜘蛛'],

        // Baidu — "baiduspider-mobile-gate" contains "Baiduspider-mobile",
        // so it has to be tested first.
        ['patterns' => ['baiduspider-mobile-gate'], 'label' => '百度移动蜘蛛'],
        ['patterns' => ['Baiduspider-mobile'], 'label' => '百度蜘蛛'],
        ['patterns' => ['Baidu-Thumbnail'], 'label' => '百度图片蜘蛛'],
        ['patterns' => ['Baiduspider-image'], 'label' => '百度图片蜘蛛'],
        ['patterns' => ['Baiduspider-news'], 'label' => '百度新闻蜘蛛'],
        ['patterns' => ['Baiduspider-video'], 'label' => '百度视频蜘蛛'],
        ['patterns' => ['Baidu-Transcoder'], 'label' => '百度音乐蜘蛛'],
        ['patterns' => ['Baiduspider'], 'label' => '百度蜘蛛'],

        // Soso
        ['patterns' => ['Sosospider'], 'label' => '搜搜蜘蛛'],
        ['patterns' => ['Sosoimagespider'], 'label' => '搜搜图片蜘蛛'],

        // Yahoo — the China variant contains "Yahoo! Slurp", keep it first.
        ['patterns' => ['Yahoo! Slurp China'], 'label' => '雅虎中文蜘蛛'],
        ['patterns' => ['Yahoo ContentMatch Crawler'], 'label' => '雅虎竞价蜘蛛'],
        ['patterns' => ['Yahoo-MMCrawler'], 'label' => '雅虎图片蜘蛛'],
        ['patterns' => ['Yahoo! Slurp'], 'label' => '雅虎英文蜘蛛'],

        // Microsoft — every specialised bot contains "msnbot", so the
        // generic rule goes last.
        ['patterns' => ['msnbot-media'], 'label' => '微软媒体蜘蛛'],
        ['patterns' => ['MSNBot-NewsBlogs'], 'label' => '微软新闻及blog蜘蛛'],
        ['patterns' => ['MSNBot-Academic'], 'label' => '微软学术蜘蛛'],
        ['patterns' => ['msnbot'], 'label' => '微软蜘蛛'],

        // 360
        ['patterns' => ['360Spider'], 'label' => '360蜘蛛'],

        // Youdao / Sogou
        ['patterns' => ['YodaoBot', 'OutfoxBot'], 'label' => '有道蜘蛛'],
        ['patterns' => ['Sogou web spider', 'Sogou Orion spider'], 'label' => '搜狗蜘蛛'],
        ['patterns' => ['Sogou inst spider'], 'label' => '搜狗蜘蛛'],
        ['patterns' => ['Sogou News Spider'], 'label' => '搜狗新闻蜘蛛'],
        ['patterns' => ['Sogou spider2'], 'label' => '搜狗蜘蛛'],
        ['patterns' => ['Sogou blog'], 'label' => '搜狗blog蜘蛛'],
        ['patterns' => ['sogou spider'], 'label' => '搜狗蜘蛛'],

        // Other search engines and legacy crawlers
        ['patterns' => ['bingbot'], 'label' => '必应蜘蛛'],
        ['patterns' => ['EtaoSpider'], 'label' => '一淘网蜘蛛'],
        ['patterns' => ['Scooter'], 'label' => 'Altavista蜘蛛'],
        ['patterns' => ['Lycos_Spider'], 'label' => 'Lycos蜘蛛'],
        ['patterns' => ['FAST-WebCrawler'], 'label' => 'Alltheweb蜘蛛'],
        ['patterns' => ['Slurp ASPSeek ASPSeek'], 'label' => 'INKTOMI蜘蛛'],
        ['patterns' => ['lanshanbot'], 'label' => '东方网景爬虫'],
        ['patterns' => ['BSpider'], 'label' => '日本爬虫'],
        ['patterns' => ['Gaisbot'], 'label' => 'Gaisbot'],
        ['patterns' => ['ia_archiver'], 'label' => 'Alexa蜘蛛'],
        ['patterns' => ['altavista'], 'label' => 'altavista爬虫'],
        ['patterns' => ['Inktomi slurp'], 'label' => 'Inktomi slurp'],
        ['patterns' => ['YandexBot'], 'label' => 'Yandex蜘蛛'],
        ['patterns' => ['AhrefsBot'], 'label' => 'AhrefsBot'],
        ['patterns' => ['ezooms.bot'], 'label' => 'ezooms.bot'],
        ['patterns' => ['YisouSpider'], 'label' => '神马搜索'],

        // SEO tools and social-network fetchers
        ['patterns' => ['MJ12bot'], 'label' => 'Majestic爬虫'],
        ['patterns' => ['SemrushBot'], 'label' => 'Semrush爬虫'],
        ['patterns' => ['DuckDuckBot'], 'label' => 'DuckDuckGo蜘蛛'],
        ['patterns' => ['facebookexternalhit'], 'label' => 'Facebook爬虫'],
        ['patterns' => ['Twitterbot'], 'label' => 'Twitter爬虫'],
        ['patterns' => ['LinkedInBot'], 'label' => 'LinkedIn爬虫'],
        ['patterns' => ['Pinterestbot'], 'label' => 'Pinterest爬虫'],
        ['patterns' => ['DotBot'], 'label' => 'DotNet爬虫'],
        ['patterns' => ['PetalBot'], 'label' => 'Petal爬虫'],
        ['patterns' => ['Exabot'], 'label' => 'Exalead爬虫'],
        ['patterns' => ['SeznamBot'], 'label' => 'Seznam爬虫'],

        // Generic "Slurp" fallback; must stay after every Yahoo/Inktomi rule.
        ['patterns' => ['Slurp'], 'label' => 'Yahoo爬虫'],
        ['patterns' => ['rogerbot'], 'label' => 'Moz爬虫'],
        ['patterns' => ['Nimbostratus'], 'label' => 'CloudFlare爬虫'],
    ];

    foreach ($rules as $rule) {
        foreach ($rule['patterns'] as $pattern) {
            // Case-insensitive substring match; first hit decides the label.
            if (stripos($userAgent, $pattern) !== false) {
                return $rule['label'];
            }
        }
    }

    return '';
}

以下代码用于判断客户端的访问类型:

/**
 * Classify the current HTTP client from its request headers.
 *
 * Checks, in order:
 *   1. Known crawler User-Agent (via isbot())   -> "蜘蛛:<label>"
 *   2. Accept header containing "text/html"     -> "访客"
 *   3. Known HTTP library / CLI tool signature  -> the tool's label
 *   4. Anything else                            -> "未知"
 *
 * Reads $_SERVER['HTTP_USER_AGENT'] and $_SERVER['HTTP_ACCEPT']; a missing
 * header is treated as an empty string.
 *
 * @return string Client classification label.
 */
function ClientType() {
    // Missing headers fall back to '' so the substring checks stay safe.
    $userAgent = $_SERVER['HTTP_USER_AGENT'] ?? '';
    $httpAccept = $_SERVER['HTTP_ACCEPT'] ?? '';

    // Crawlers take priority over every other classification.
    $botType = isbot($userAgent);
    if ($botType !== '') {
        return "蜘蛛:" . $botType;
    }

    // A client that advertises text/html in Accept is treated as a visitor.
    if (stripos($httpAccept, 'text/html') !== false) {
        return "访客";
    }

    // HTTP libraries and command-line tools, matched as case-insensitive
    // substrings in insertion order. A specific signature must precede any
    // generic keyword that appears in the same User-Agent:
    //   "Apache-HttpClient/4.5 (Java/1.8)" contains "java"
    //   "libwww-perl/6.05"                 contains "perl"
    $crawlerTools = [
        'curl' => 'cURL命令行工具',
        'wget' => 'Wget下载工具',
        'python' => 'Python爬虫',
        'apache-httpclient' => 'Apache HTTP客户端',
        'java' => 'Java爬虫',
        'php' => 'PHP爬虫',
        'libwww' => 'libwww-perl工具',
        'perl' => 'Perl爬虫',
        'ruby' => 'Ruby爬虫',
        'go-http-client' => 'Go爬虫',
        'node-fetch' => 'Node.js爬虫',
        'okhttp' => 'OkHttp客户端',
        'http-client' => 'HTTP客户端',
        'axios' => 'Axios HTTP客户端',
    ];

    foreach ($crawlerTools as $signature => $label) {
        if (stripos($userAgent, $signature) !== false) {
            return $label;
        }
    }

    // Nothing recognised.
    return "未知";
}


附件下载
上一篇 下一篇

相关阅读

发表评论

访客 访客
快捷回复: 表情:
评论列表 (有 0 条评论,26人围观)