Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions Doc/library/urllib.robotparser.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,18 @@ structure of :file:`robots.txt` files, see :rfc:`9309`.
.. class:: RobotFileParser(url='')

This class provides methods to read, parse and answer questions about the
:file:`robots.txt` file at *url*.
:file:`robots.txt` file at *url*, which may be a URL string or a
:class:`urllib.request.Request` object.

.. versionchanged:: next
The *url* parameter can now be a :class:`urllib.request.Request` object.

.. method:: set_url(url)

Sets the URL referring to a :file:`robots.txt` file.
Sets the URL referring to a :file:`robots.txt` file. *url* may be a
string or a :class:`urllib.request.Request` object.

.. versionchanged:: next
The *url* parameter can now be a :class:`urllib.request.Request` object.

.. method:: read()

Expand Down Expand Up @@ -102,3 +109,17 @@ class::
True
>>> rp.can_fetch("*", "http://www.pythontest.net/no-robots-here/")
False


The following example demonstrates the use of a :class:`urllib.request.Request`
object that carries a custom ``User-Agent`` header::

>>> import urllib.robotparser
>>> import urllib.request
>>> rp = urllib.robotparser.RobotFileParser()
>>> rp.set_url(urllib.request.Request("http://www.pythontest.net/robots.txt", headers={"User-Agent": "IsraBot"}))
>>> rp.read()
>>> rp.can_fetch("*", "http://www.pythontest.net/")
True
>>> rp.can_fetch("*", "http://www.pythontest.net/no-robots-here/")
False
31 changes: 31 additions & 0 deletions Lib/test/test_robotparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,37 @@ def testServiceUnavailable(self):
self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))


class UserAgentSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
    """Check that a Request object passed to set_url() carries its headers.

    The local handler rejects urllib's default User-Agent with a 403 and
    serves a permissive robots.txt to any other agent, so the two halves
    of the test observe opposite parser states.
    """

    class RobotHandler(BaseHTTPRequestHandler):
        """Serve robots.txt to every client except urllib's default agent."""

        def do_GET(self):
            # A client may omit the header entirely; default to '' so the
            # prefix check cannot fail with AttributeError on None.
            user_agent = self.headers.get('User-Agent', '')
            if user_agent.startswith('Python-urllib'):
                self.send_error(403, "Forbidden access")
            else:
                self.send_response(200)
                self.end_headers()
                # An empty Disallow rule permits everything.
                self.wfile.write(b"User-agent: *\nDisallow:")

        def log_message(self, format, *args):
            # Keep the test output free of per-request log lines.
            pass

    def testUserAgentFilteringSite(self):
        addr = self.server.server_address
        url = f'http://{socket_helper.HOST}:{addr[1]}'
        robots_url = url + "/robots.txt"
        file_url = url + "/document"

        # Plain string URL: fetched with the default agent, which the
        # handler answers with 403, so the parser must disallow everything.
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        self.assertTrue(parser.disallow_all)
        self.assertFalse(parser.can_fetch("*", file_url))

        # Request object with a custom agent: the handler serves the
        # permissive robots.txt, so fetching must be allowed.
        request = urllib.request.Request(
            robots_url, headers={'User-Agent': 'cybermapper'})
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(request)
        parser.read()
        self.assertFalse(parser.disallow_all)
        self.assertTrue(parser.can_fetch("*", file_url))


@support.requires_working_socket()
class NetworkTestCase(unittest.TestCase):

Expand Down
7 changes: 6 additions & 1 deletion Lib/urllib/robotparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,13 @@ def modified(self):
self.last_checked = time.time()

def set_url(self, url):
    """Sets the URL referring to a robots.txt file.

    *url* can be a string or a urllib.request.Request object.
    """
    # Store the argument exactly as given (a Request keeps its headers);
    # only the host/path parsing below needs the plain URL string.
    self.url = url
    if isinstance(url, urllib.request.Request):
        url = url.full_url
    self.host, self.path = urllib.parse.urlsplit(url)[1:3]

def read(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Allow ``urllib.robotparser.RobotFileParser`` to accept a ``urllib.request.Request`` object, as well as a URL string, when setting the :file:`robots.txt` URL.
Loading