AI
Animation
CGI
Compression
Console
Database
Debug
Dialects
Dialogs
Editor
Email
Encryption
Extension
External Library
File
File Handling
Files
Financial
FTP
Game
Games
Graphics
GUI
HTML
HTTP
Internet
LDC
Markup
Math
Module
Network
Networking
None
Other - Net
Parse
Patch
Printing
Protocol
Rebol
Scheme
Scientific
SDK
Security
Shell
Sound
SQL
TCP
Testing
Text
Text Processing
UI
User Interface
Util
Utility
VID
Visualization
Web
Win API
X-File
XML
REBOL [
    Title: "Web Site Checker"
    Date: 11-June-2004  ;16-May-2001
    Version: 1.1.1
    File: %site-check.r
    Author: "Carl Sassenrath"
    Purpose: {Scan a web site looking for missing pages, remote links, email links, etc. Helps you clean up sites.}
    Email: carl@rebol.com
    library: [
        level: 'intermediate 
        platform: 'all 
        type: 'tool 
        domain: [web file-handling markup parse] 
        tested-under: none 
        support: none 
        license: none 
        see-also: none
    ]
]

; 1.1.1 - Fixes problem when top-level relative paths are used ( /index.html /about.html etc)

;--Config:
; Root of the site to crawl. The crawl starts here, and links whose text
; begins with this URL (or with "/") are treated as local pages.
base-url: http://www.rebol.com
; Compared against `length? page` of the load/markup result in scan-page,
; so this is a count of loaded markup items, not a byte size.
threshold: 8000   ; used to filter out huge pages
; Matched with find/match against the start of each URL, so every entry acts
; as a prefix. An excluded page is still recorded in scanned-urls; it is just
; not fetched.
exclude-urls: [   ; URL patterns for pages to exclude
    http://www.rebol.com/library
    http://www.rebol.com/docs/core23
    http://www.rebol.com/dictionary
    http://www.rebol.com/users.html
    http://www.rebol.com/docs/dictionary
]

;--Lists:
; String form of base-url, used as a literal match in scan-page's parse rule.
base-str: form base-url
scanned-urls: []  ; every URL handed to scan-page (including excluded/missing ones)
missing-urls: []  ; URLs whose load/markup raised an error
remote-urls: []   ; links starting with "http:" that do not start with base-str
local-urls: []    ; on-site html pages queued by add-url
secure-urls: []   ; links starting with "https:"
email-urls: []    ; links starting with "mailto:"
ftp-urls: []      ; links starting with "ftp:"
ref-urls: []  ; pairs of: url and referrer (referrer stored as a string)

;--Functions:
html?: func [
    {Return true when the text after the last "." in url is htm or html, else none.}
    url /local ext
][
    ; find/last/tail leaves ext positioned just past the final dot (or none).
    if ext: find/last/tail url "." [
        ext: to-string ext
        any [ext = "htm" ext = "html"]
    ]
]

add-url: func [
    {Queue an html page link in urlset and record which page referred to it.}
    urlset "Block that receives the URL"
    url    "Link target; any #fragment is removed from it in place"
    from   "Page on which the link was found"
][
    ; Drop the fragment so page.html#a and page.html#b count as one page.
    ; When there is no "#", find yields none and clear is handed that none.
    clear find url "#"
    ; Skip query-string URLs and anything that does not end in .htm/.html.
    if all [
        not find url "?"
        html? url
    ][
        append urlset url
        repend ref-urls [url form from] ; second is string
    ]
]

scan-page: func [
    {Load one page and file every HREF found in it under the matching link list.}
    url /local page href link base
][
    print ["Scanning:" url length? local-urls length? missing-urls]
    append scanned-urls url   ; recorded first, so excluded/missing pages are never retried

    foreach pattern exclude-urls [
        if find/match url pattern [print "(excluded)" exit]
    ]

    ; Relative links resolve against the page's directory when the URL names
    ; an html file, otherwise against the URL itself.
    base: url
    if html? url [base: first split-path url]

    if error? try [page: load/markup url] [
        append missing-urls url
        exit
    ]
    if (length? page) > threshold [exit] ; big page, skip it.

    foreach item page [
        if all [
            tag? item
            href: select parse item "=" "HREF"
        ][
            link: to-url href
            ; Alternatives are tried in order, so base-str must come before
            ; the generic "http:" case for on-site absolute links to stay local.
            parse/all href [
                "#" |
                base-str  (add-url local-urls link url) |
                "/"       (add-url local-urls base-url/:link url) |  ;1.1.1
                "http:"   (append remote-urls link) |
                "https:"  (append secure-urls link) |
                "ftp:"    (append ftp-urls link) |
                "mailto:" (append email-urls link) |
                none      (add-url local-urls base/:link url)
            ]
        ]
    ]

    ; De-duplicate each global list after the page has been processed.
    foreach list [remote-urls local-urls secure-urls email-urls ftp-urls] [
        set list unique get list
    ]
]

;--Main code:
scan-page base-url

; Keep scanning until no discovered local page is left unvisited.
; scan-page adds each URL to scanned-urls before doing anything else,
; so every pass removes at least one URL from the difference set.
while [pick urls: exclude local-urls scanned-urls 1][
    scan-page pick urls 1
]

out: reform ["Site Summary for" base-url "on" now newline]

; Append a titled section listing urls to the report; sorts urls in place.
emit-urls: func [title urls] [
    sort urls
    repend out [newline title newline]
    foreach url urls [repend out [url newline]]
]

emit-urls "Scanned Pages:" scanned-urls
emit-urls "Remote Links:"  remote-urls
emit-urls "Email Links:"   email-urls
; These two lists are filled by scan-page but were previously never reported.
emit-urls "Secure Links:"  secure-urls
emit-urls "FTP Links:"     ftp-urls

repend out "^/References:^/"
foreach [url url2] ref-urls [repend out [url2 " -> " url newline]]

repend out "^/Missing Pages:^/"
foreach url missing-urls [
    n: ref-urls
    ; repend does not insert spaces, so the labels carry their own separator.
    repend out ["Missing URL: " url newline]
    ; ref-urls holds url/referrer pairs; the referrer is stored as a string
    ; (see add-url), and n/2 is the referrer of the pair found at n.
    while [n: find n url] [
        repend out [tab "Ref from: " n/2 newline]
        n: next n
    ]
]

write %site-summary.txt out
browse %site-summary.txt


            
            
        
; Copyright © 2018 Rebol Software Foundation