Rebol

Scripts

Animation
CGI
Compression
Console
Database
Debug
Dialects
Dialogs
Editor
Encryption
Extension
External Library
File
File Handling
Files
Financial
FTP
Game
Games
Graphics
GUI
HTML
HTTP
Internet
LDC
Markup
Math
Module
Network
Networking
None
Other - Net
Parse
Patch
Printing
Protocol
Rebol
Scheme
Scientific
SDK
Security
Shell
Sound
SQL
TCP
Testing
Text
Text Processing
User Interface
Util
Utility
VID
Visualization
Web
Win API
X-File
XML
Wiki

Download Script View Documentation
REBOL[
    Title: "extract-web-links"
    Version: 1.2.1
    Date: 18-04-2005
    Author: "Peter WA Wood"
    Copyright: "Peter WA Wood"
    File: %extract-web-links.r
    Purpose: {A function which scans a string (normally a web page)
              and creates a block of URL/Text combinations for each
              HTML  tag in the string.}
    Usage: { example: extract-web-links read http://www.rebol.org }
    Library: [
        level: 'beginner
        type: 'function
        domain: [web html markup]
        platform: 'all
        tested-under: [ core 2.5.6.2.4 "Mac OS X 10.2.8"
                        core 2.5.6.3.1 "Windows XP Professional"
                        view 1.2.10.3.1 "Windows XP Professiona"]
        support: none
        license: 'cc-by 
	 {see http://www.rebol.org/cgi-bin/cgiwrap/rebol/license-help.r}
    ]
]

extract-web-links: func [
     {scans a string (normally a web page)
     and returns a block of URL (url!) / Text (string!) combinations for         
     each HTML  tag in the string}
    web-page [string!]
        "The string from which to extract web links"
    /only-urls
        "Only URLs are returned in the block"
    /only-descriptions
        "Only the supplied descriptions of the links are returned"
    /local
	    result-block
	        "Block for harvesting URLs and descriptions"
    	collected-url
            "Used to harvest individual URLs"
        collected-desc
            "Used to harvest individual descriptions"
        end-pos
            "end position of selection of web-page"
][

it: [                       ; the main processing of the function

    do initialisation

    until [                     ; end of web-page is reached
    
                                            ;Get the next URL in web-page
        web-page: find/tail web-page " taqs too
					
        if web-page [                       ;  tag found
            if (not only-descriptions) [    ; URLs requested
                do harvest-url              ; Harvest the URL

                either collected-url        ; Add any URL to result block
                
                   [append result-block to-URL collected-url]
                   
                   [break]                  ; skip to next url

             ]    
	
             if not only-urls [             ; Descriptions wanted ?
                 do harvest-desc	    
                 append result-block collected-desc
             ]                       
        ]  
  
        web-page = none	                    ; test for end of web-page
    ] ; end until

    return result-block

] ; end it


;;=======================================================================

initialisation: [
    result-block: make block! []            ; initialise result block

;   If both "only" refinements are set, turn them off.
;     Using both refinements has the same effect as using neither.
;     This allows the  remainder of the code to treat the refinements as 
;     being mutually exclusive.      

    if all [only-urls only-descriptions] [
        only-urls: none
        only-descriptions: none
    ]

] ; end intialisation

;;=======================================================================

harvest-url: [             ; section to harvest URL

    collected-url: copy ""

    web-page:  find/tail web-page "href="   ; move to char after href=
   
    either web-page [                       ; check href present

                        ; Find start of URl
                                            ; href may be full or relative URL 
                                            ; Skip opening quote if full URL

        if (first web-page) = #"^""
             [web-page: next web-page]

        end-pos: find web-page ">"          ; find end of  tag                                    
         
        either end-pos [                    ; end of  tag found
        
	        collected-url: copy/part web-page end-pos
            if (last collected-url) = #"^"" ; remove trailing quote
            
                [collected-url: head remove back tail collected-url]
                
        ][                  ; no closing > for  tag !!
            collected-url: none
        ]

    ][                      ; no href !!!
        collected-url: none  
    ]

] ; end harvest-url

;;======================================================================

harvest-desc: [	        ; section to harvest description 
    collected-desc: copy ""
    web-page: find web-page ">"         ; move to end of  tag
    if web-page [
        web-page: next web-page         ; move past >

        if find web-page "" [       ; look for closing tag

            end-pos: find web-page ""   ; set end-pos at <
            
						;check for img tag
            either find/part web-page "")]][
        collected-desc: find collected-desc "<"
        remove remove/part collected-desc find collected-desc ">" 
         
    ]

collected-desc: head collected-desc		;  set index at start

] ; strip-embedded-tags

;;========================================================================


do it				; execute the function code
 
] ; end extract-web-links

;; History
;; 
;; 1.1.0 23-Dec-2004    Initial release
;; 1.2.0 16-Jan-2004    Usage, copyright added to header
;; 1.2.1 19-Apr-2004    Tested under View 1.2.10.3.1
;;
;;