#!/usr/local/bin/rebol -cs
REBOL [
Title: "Server object handler script"
File: %server.r
Author: [ "HY" ]
Purpose: { This script parses robots.txt files and stores
a hash! of forbidden paths. This is very useful for
webbots or spiders of any kind (at least if obeying
the robots exclusion standard is desirable).
I looked at the ht://dig package (http://www.htdig.org/)
script that does the same thing as this one, to see whether
I had overlooked anything, and fixed a few bugs as a result.
}
Date: 19-Oct-2004
Examples: {
politebot: server "PoliteBot"
politebot/add reduce ["www.netscape.com" read http://www.netscape.com/robots.txt
"www.google.com" read http://www.google.com/robots.txt ]
print politebot/forbidden? http://www.google.com/search?q=rebol&ie=UTF-8&oe=UTF-8&hl=no
print politebot/forbidden? http://www.netscape.com/
}
History: [
19-Oct-2004 {Removed a bug that caused this path: /rebol/index.html
to be forbidden if robots.txt had Disallow: /i
I should do something about port numbers in URLs as well.}
23-Sep-2004 {Added the library header}
22-Apr-2004 {Removed a bug that modified an incoming variable
(from other scripts) directly.}
]
Library: [
level: 'intermediate
domain: [parse http text-processing web]
license: none
Platform: 'all
Tested-under: none
Type: [module]
Support: none
]
]
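; Note: forbidden? relies on the 'url-handler function from url-handler.r
; (http://www.oops-as.no/roy/rebol-scripts/url-handler.r) to split URLs into
; protocol, host and path. Load that script before calling forbidden?, for
; example (assuming it has been saved locally under its original name):
; do %url-handler.r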
server-handler-object: context [
; 'context should automatically pick up set-words
; and bind them to the new local context, so:
bot-name: ""
patterns: make hash! copy []
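; patterns holds pairs of hostname and a block of disallowed path prefixes,
; e.g. ["www.example.com" ["/cgi-bin/" "/tmp/"]] (host and paths purely illustrative)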
add: func [
"Parses robots.txt contents for one or more hosts and stores their disallowed path prefixes"
block [block!] "Pairs of hostname (string!) and robots.txt contents (string! or none!)"
/local seen-my-name? pay-attention? my-patterns lines line hash parsed name rest
] [
foreach [ host robots-file ] block [
seen-my-name?: false
pay-attention?: false
my-patterns: copy []
if none? robots-file [robots-file: ""]
lines: parse/all robots-file "^/"
forall lines [
line: first lines
if hash: find line "#" [ line: head clear hash ] ; strip comments ("#" to end of line)
if all [0 < length? line #"#" = first line] [
line: ""
]
parsed: parse/all line ":"
if not 0 = length? parsed [
name: trim first parsed
either 1 < length? parsed [
rest: trim first next parsed
] [
rest: ""
]
if name = "user-agent" [
either all [rest = "*" not seen-my-name?] [
pay-attention?: true
] [
either rest = bot-name [
either not seen-my-name? [
seen-my-name?: true
pay-attention?: true
clear my-patterns ; our own section overrides any rules collected from the "*" section
] [
pay-attention?: false ; only take first section with our name
]
] [
pay-attention?: false ; none of our business
] ; end either rest = bot-name
] ; end either rest is * and not seen my name
] ; end name = user-agent
if all [pay-attention? name = "disallow"] [
if not 0 = length? rest [
append my-patterns rest
]
] ; end pay attention and name is disallow
] ; end length? parsed not 0
] ; end forall lines
append patterns reduce [host my-patterns]
]
] ; end add
got-robots-file?: func [
"Tells whether or not we have the robots.txt file for a given host"
host [string!] "The hostname to look for"
] [
found? find patterns host ; return a logic! value, as the name promises
]
forbidden?: func [
"Tells whether or not a given URL is forbidden by the robots.txt file"
in-url [object! string! url!]
{The URL to check. This must be a complete URL. If this is an
object, it is assumed to be a url-handler object
(http://www.oops-as.no/roy/rebol-scripts/url-handler.r).
}
/local url path pattern
] [
; don't modify the incoming variable! (objects are only read from, and copy does not take object!)
url: either object? in-url [in-url] [copy in-url]
if url? url [url: to-string url ]
if string? url [
if "" = url [return false] ; this might be just a bit too quick ...
; assume this is a complete url:
url: url-handler url
]
; allowing objects, but assuming they are url-handler objects
; (http://www.rebol.org/cgi-bin/cgiwrap/rebol/view-script.r?color=yes&script=url-handler.r)
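; such an object is expected to provide at least the 'plain (full URL string),
; 'protocol and 'host fields used below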
path: find/match url/plain join url/protocol url/host
pattern: select patterns url/host
if none? pattern [return false] ; Maybe load robots.txt instead, to check for real?
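; a URL is forbidden when its path begins with any of the stored disallowed prefixes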
foreach p pattern [
if found? find/part path p length? p [
return true
]
]
false
]
]
; shortcut:
server: func [
"Shorthand for make server-handler-object [ bot-name: name ]"
name [string!] "The bot's name. This is the name that we will look for in the robots.txt files."
] [
make server-handler-object [ bot-name: name ]
]