# Crawlers Setup
User-agent: *

# Directories
Disallow: /wp-admin/
Disallow: /wp-includes/
Disallow: /404/
Disallow: /app/
Disallow: /cgi-bin/
Disallow: /downloader/
Disallow: /errors/
Disallow: /includes/
Disallow: /js/
Disallow: /lib/
Disallow: /magento/
#Disallow: /media/
Disallow: /pkginfo/
Disallow: /report/
Disallow: /scripts/
Disallow: /shell/
Disallow: /skin/
Disallow: /stats/
Disallow: /var/

# Paths (clean URLs)
Disallow: /index.php/
Disallow: /catalog/product_compare/
Disallow: /catalog/category/view/
Disallow: /catalog/product/view/
Disallow: /catalogsearch/
Disallow: /checkout/
Disallow: /contacts/
Disallow: /customer/
Disallow: /customize/
Disallow: /newsletter/
Disallow: /poll/
Disallow: /review/
Disallow: /sendfriend/
Disallow: /tag/
Disallow: /wishlist/
Disallow: /catalog/product/gallery/
Disallow: /productalert/
Disallow: /directory/

# Files
Disallow: /cron.php
Disallow: /cron.sh
Disallow: /scheduler_cron.sh
Disallow: /error_log
Disallow: /install.php
Disallow: /LICENSE.html
Disallow: /LICENSE.txt
Disallow: /LICENSE_AFL.txt
Disallow: /STATUS.txt

# Paths (no clean URLs)
Disallow: /*.js$
Disallow: /*.css$
Disallow: /*.php$
Disallow: /*?p=*&
Disallow: /*?limit=*
Disallow: /*?dir=*
Disallow: /*?order=*
Disallow: /*?l=*
Disallow: /*?SID=

#
# Google crawler setup: a bot that finds a section matching its own token
# ignores the generic (*) section entirely, so the empty Disallow rules
# below give Googlebot and Googlebot-Image unrestricted access
User-agent: Googlebot-Image
Disallow:

User-agent: Googlebot
Disallow:

#
# Yandex tends to be rather aggressive; may be worth keeping it at arm's length
User-agent: YandexBot
Crawl-delay: 20

User-agent: Pinterest
Crawl-delay: 1

#
# Block Ahrefs
User-agent: AhrefsBot
Disallow: /

# Block SEOkicks
User-agent: SEOkicks-Robot
Disallow: /

# Block SISTRIX
User-agent: SISTRIX Crawler
Disallow: /

# Block Uptime robot
User-agent: UptimeRobot/2.0
Disallow: /

User-agent: 008
Disallow: /

# Block Ezooms Robot
User-agent: Ezooms Robot
Disallow: /

# Block Perl LWP
User-agent: Perl LWP
Disallow: /

# Block BLEXBot
User-agent: BLEXBot
Disallow: /

# Block netEstate NE Crawler (+http://www.website-datenbank.de/)
User-agent: netEstate NE Crawler (+http://www.website-datenbank.de/)
Disallow: /

# Block WiseGuys Robot
User-agent: WiseGuys Robot
Disallow: /

# Block Turnitin Robot
User-agent: Turnitin Robot
Disallow: /

User-agent: TurnitinBot
Disallow: /

User-agent: Turnitin Bot
Disallow: /

User-agent: TurnitinBot/3.0 (http://www.turnitin.com/robot/crawlerinfo.html)
Disallow: /

User-agent: TurnitinBot/3.0
Disallow: /

# Block Heritrix
User-agent: Heritrix
Disallow: /

# Block pricepi
User-agent: pimonster
Disallow: /

User-agent: Pimonster
Disallow: /

# Block Searchmetrics Bot
User-agent: SearchmetricsBot
Disallow: /

# Block Eniro
User-agent: ECCP/1.0 (search@eniro.com)
Disallow: /

# Block Baidu
User-agent: Baiduspider
User-agent: Baiduspider-video
User-agent: Baiduspider-image
User-agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
User-agent: Mozilla/5.0 (compatible; Baiduspider/3.0; +http://www.baidu.com/search/spider.html)
User-agent: Mozilla/5.0 (compatible; Baiduspider/4.0; +http://www.baidu.com/search/spider.html)
User-agent: Mozilla/5.0 (compatible; Baiduspider/5.0; +http://www.baidu.com/search/spider.html)
User-agent: Baiduspider/2.0
User-agent: Baiduspider/3.0
User-agent: Baiduspider/4.0
User-agent: Baiduspider/5.0
Disallow: /

# Block SoGou
User-agent: Sogou Spider
Disallow: /

# Block Youdao
User-agent: YoudaoBot
Disallow: /

# Block Nikon JP Crawler
User-agent: gsa-crawler (Enterprise; T4-KNHH62CDKC2W3; gsa_manage@nikon-sys.co.jp)
Disallow: /
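#
# Note: per RFC 9309 a compliant crawler matches groups by product token
# (e.g. "Baiduspider"), not by its full HTTP User-Agent header, so the
# "Mozilla/5.0 (compatible; ...)" and versioned variants in this file are
# defensive duplicates. For a compliant bot a single token per group is
# enough, e.g.:
#
#   User-agent: Baiduspider
#   Disallow: /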
# Block MegaIndex.ru
User-agent: MegaIndex.ru/2.0
Disallow: /

User-agent: MegaIndex.ru
Disallow: /

User-agent: megaIndex.ru
Disallow: /

# Block Mail.RU
User-agent: Mail.RU_Bot/2.0
Disallow: /

User-agent: Mail.RU
Disallow: /

User-agent: Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots
Disallow: /

# Block Majestic (MJ12bot)
User-Agent: MJ12bot
Disallow: /

User-Agent: MJ12bot/v1.4.3
Disallow: /
Crawl-delay: 30

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

User-agent: Twiceler
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites or download them for offline viewing. Please obey robots.txt.
#
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

User-agent: Nutch
Disallow: /

User-agent: spock
Disallow: /

User-agent: OmniExplorer_Bot
Disallow: /

User-agent: BecomeBot
Disallow: /

User-agent: genieBot
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: dotbot
Disallow: /

User-agent: MLBot
Disallow: /

User-agent: 80bot
Disallow: /

User-agent: Linguee Bot
Disallow: /

User-agent: aiHitBot
Disallow: /

User-agent: Exabot
Disallow: /

User-agent: SBIder/Nutch
Disallow: /

User-agent: Jyxobot
Disallow: /

User-agent: mAgent
Disallow: /

User-agent: Speedy Spider
Disallow: /

User-agent: ShopWiki
Disallow: /

User-agent: Huasai
Disallow: /

User-agent: DataCha0s
Disallow: /

User-agent: Atomic_Email_Hunter
Disallow: /

User-agent: Mp3Bot
Disallow: /

User-agent: WinHttp
Disallow: /

User-agent: betaBot
Disallow: /

User-agent: core-project
Disallow: /

User-agent: panscient.com
Disallow: /

User-agent: Java
Disallow: /

User-agent: libwww-perl
Disallow: /
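#
# Optionally, declare the sitemap here so crawlers that honor the Sitemap
# directive can find it; the URL below is a placeholder and must be changed
# to this store's real hostname before uncommenting.
# Sitemap: https://www.example.com/sitemap.xml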