# robots.txt for http://www.wikipedia.org/ and friends
|
|
#
|
|
# Please note: There are a lot of pages on this site, and there are
|
|
# some misbehaved spiders out there that go _way_ too fast. If you're
|
|
# irresponsible, your access to the site may be blocked.
|
|
#
|
|
|
|
# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
|
|
# and ignoring 429 ratelimit responses, claims to respect robots:
|
|
# http://mj12bot.com/
|
|
User-agent: MJ12bot
|
|
Disallow: /
|
|
|
|
# advertising-related bots:
|
|
User-agent: Mediapartners-Google*
|
|
Disallow: /
|
|
|
|
# Wikipedia work bots:
|
|
User-agent: IsraBot
|
|
Disallow:
|
|
|
|
User-agent: Orthogaffe
|
|
Disallow:
|
|
|
|
# Crawlers that are kind enough to obey, but which we'd rather not have
|
|
# unless they're feeding search engines.
|
|
User-agent: UbiCrawler
|
|
Disallow: /
|
|
|
|
User-agent: DOC
|
|
Disallow: /
|
|
|
|
User-agent: Zao
|
|
Disallow: /
|
|
|
|
# Some bots are known to be trouble, particularly those designed to copy
|
|
# entire sites. Please obey robots.txt.
|
|
User-agent: sitecheck.internetseer.com
|
|
Disallow: /
|
|
|
|
User-agent: Zealbot
|
|
Disallow: /
|
|
|
|
User-agent: MSIECrawler
|
|
Disallow: /
|
|
|
|
User-agent: SiteSnagger
|
|
Disallow: /
|
|
|
|
User-agent: WebStripper
|
|
Disallow: /
|
|
|
|
User-agent: WebCopier
|
|
Disallow: /
|
|
|
|
User-agent: Fetch
|
|
Disallow: /
|
|
|
|
User-agent: Offline Explorer
|
|
Disallow: /
|
|
|
|
User-agent: Teleport
|
|
Disallow: /
|
|
|
|
User-agent: TeleportPro
|
|
Disallow: /
|
|
|
|
User-agent: WebZIP
|
|
Disallow: /
|
|
|
|
User-agent: linko
|
|
Disallow: /
|
|
|
|
User-agent: HTTrack
|
|
Disallow: /
|
|
|
|
User-agent: Microsoft.URL.Control
|
|
Disallow: /
|
|
|
|
User-agent: Xenu
|
|
Disallow: /
|
|
|
|
User-agent: larbin
|
|
Disallow: /
|
|
|
|
User-agent: libwww
|
|
Disallow: /
|
|
|
|
User-agent: ZyBORG
|
|
Disallow: /
|
|
|
|
User-agent: Download Ninja
|
|
Disallow: /
|
|
|
|
# Misbehaving: requests much too fast:
|
|
User-agent: fast
|
|
Disallow: /
|
|
|
|
#
|
|
# Sorry, wget in its recursive mode is a frequent problem.
|
|
# Please read the man page and use it properly; there is a
|
|
# --wait option you can use to set the delay between hits,
|
|
# for instance.
|
|
#
|
|
User-agent: wget
|
|
Disallow: /
|
|
|
|
#
|
|
# The 'grub' distributed client has been *very* poorly behaved.
|
|
#
|
|
User-agent: grub-client
|
|
Disallow: /
|
|
|
|
#
|
|
# Doesn't follow robots.txt anyway, but...
|
|
#
|
|
User-agent: k2spider
|
|
Disallow: /
|
|
|
|
#
|
|
# Hits many times per second, not acceptable
|
|
# http://www.nameprotect.com/botinfo.html
|
|
User-agent: NPBot
|
|
Disallow: /
|
|
|
|
# A capture bot, downloads gazillions of pages with no public benefit
|
|
# http://www.webreaper.net/
|
|
User-agent: WebReaper
|
|
Disallow: /
|
|
|
|
#
|
|
# Friendly, low-speed bots are welcome to view article pages, but not
|
|
# dynamically-generated pages please.
|
|
#
|
|
# Inktomi's "Slurp" can read a minimum delay between hits; if your
|
|
# bot supports such a thing using the 'Crawl-delay' or another
|
|
# instruction, please let us know.
|
|
#
|
|
# There is a special exception for API mobileview to allow dynamic
|
|
# mobile web & app views to load section content.
|
|
# These views aren't HTTP-cached but use parser cache aggressively
|
|
# and don't expose special: pages etc.
|
|
#
|
|
# Another exception is for REST API documentation, located at
|
|
# /api/rest_v1/?doc.
|
|
#
|
|
User-agent: *
|
|
Allow: /w/api.php?action=mobileview&
|
|
Allow: /w/load.php?
|
|
Allow: /api/rest_v1/?doc
|
|
Allow: /w/images/
|
|
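# Anything not disallowed below is allowed by default. An explicit "Allow: /"
# here would make first-match parsers ignore every Disallow that follows.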
|
|
Disallow: /w/
|
|
Disallow: /api/
|
|
Disallow: /trap/
|
|
Disallow: /wiki/Special:
|
|
Disallow: /wiki/Special%3A
|
|
|
|
# https://phabricator.wikimedia.org/T16075
|
|
Disallow: /wiki/MediaWiki:Spam-blacklist
|
|
Disallow: /wiki/MediaWiki%3ASpam-blacklist
|
|
Disallow: /wiki/MediaWiki_talk:Spam-blacklist
|
|
Disallow: /wiki/MediaWiki_talk%3ASpam-blacklist
|
|
#
|
|
Disallow: /wiki/Template:Editnotices
Disallow: /wiki/Template%3AEditnotices
|
|
#
|
|
Disallow: /wiki/Category:Noindexed_pages
|
|
Disallow: /wiki/Category%3ANoindexed_pages
|
|
#
|
|
# User sandboxes for modules and Template Styles are placed in these subpages for testing
|
|
#
|
|
Disallow: /wiki/Module:Sandbox
|
|
Disallow: /wiki/Module%3ASandbox
|
|
Disallow: /wiki/Template:TemplateStyles_sandbox
|
|
Disallow: /wiki/Template%3ATemplateStyles_sandbox
|