""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # only accept an integer value, otherwise ignore it
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check that the value has the form "requests/seconds"
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not to exist,
        # assume that no url is allowable; this prevents false positives
        # when can_fetch() is called before read().
        if not self.last_checked:
            return False
        # search for the given user agent; the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        return self.default_entry.delay

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        return self.default_entry.req_rate

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        ret.append('')  # blank line terminates the entry
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
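

# --------------------------------------------------------------------------
# Usage sketch (an addition, not part of the stock module): a minimal demo
# driving the parser from an in-memory robots.txt so it runs without network
# access.  The agent name and rules below are made up for illustration.
if __name__ == "__main__":
    sample = """\
User-agent: *
Disallow: /private/
Crawl-delay: 2
Request-rate: 3/10
"""
    rp = RobotFileParser()
    rp.parse(sample.splitlines())
    print(rp.can_fetch("ExampleBot", "/private/page.html"))  # False
    print(rp.can_fetch("ExampleBot", "/public/page.html"))   # True
    print(rp.crawl_delay("ExampleBot"))                      # 2
    print(rp.request_rate("ExampleBot"))                     # RequestRate(requests=3, seconds=10)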