REBOL [
    Title: "Rebol-WebBot"
    Date: 17-Jan-2003/19:19:09+1:00
    Version: 0.1.2
    File: %reb-web-bot.r
    Author: "Oldes"
    Purpose: "To download pages from some url with all content"
    History: [
    0.1.2 {More bug fixes, added handling generated documents (content-type test)} 
    0.1.1 "A lot of bug fixes + travelling cycle added" 
    0.1.0 {Download-page function and important functions for URLs manipulations} 
    0.0.1 "Just a simple page parser"
]
    Email: oliva.david@seznam.cz
    Category: [markup net util web]
    note: {
^-^-Author of this script uses it just for testing own pages and do not take any responsibility of using this script for other purposes!
^-^-Downloading some content available in the Internet may some people consider to be illegal! Also there is a security risk of downloading dangerous softwares as viruses and other unsecured files and storing them in your PCs so be very careful what you are going to download!
^-^-Running this script in high speed on some servers also may be considered as an attack or data stealing and may lead to banishing your IP address on these servers!}
    todo: [
    "Non blocking file download and better View version" 
    {Cookie handler (just to go thru all these cookies traps)} 
    "To modify only links to already downloaded files!" 
    {logic for downloading files (not to download already downloaded files)} 
    {test, test and test and then fix the bugs and clear the code} 
    {MySQL version for cooperation with more bots and large trips}
]
]

reb-web-bot: make object! [
    ;found links in this page:
        images: make block! 50
        links: make block! 500
        scripts: make block! 10
        stylesheets: make block! 10
        otherfiles: make block! 10

    text-version: true
    
    http-header: make object! [
        referer: none
        cookie: none
        user_agent: {Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)}
    ]
    cgi-extensions: ["php" "php3" "cfm" "asp" "r" "cgi" "pl"]
    stop: false
    root-dir: none ;%/d/downloaded-pages/
    purl: this-page-url:    ;file that's processed
    base-href: base-path-parts: none ;base url for relative paths
    new-html: page-markup: none
    x: y: url: tag-name: none   ;help variables for rebuilding tags
    pages-to-search: make block! 500    ;links that will be processed
    pages-searched:  make block! 500    ;links already processed
    file-info: none ;contains file-size, date, content-type and loaded bytes of the
                    ;file that's downloading

    wanna-search-rules: make block! [] ;block with parse rules
    ;if none of them is true, the page will not be searched
    ;if it's empty, all links will be followed!
    ;EXAMPLE: wanna-search-rules: [[thru "test" thru ".html" end]]
    ;  ==> all links wich contain "test" and ends with ".html" will be followed
        
    if none? find [View Link] system/product [
        net-utils/url-parser/parse-url: func [
            {Return url dataset or cause an error if not a valid URL}
            port [object! port!]
            url
            /set-scheme "Set port scheme"
            /no-error "Do not throw an error, return NONE instead."
        ] bind [
            set vars none
            either parse/all url url-rules [
                if net-watch [print ["URL Parse:" reduce vars]]
                if user [port/user: user]
                if pass [port/pass: pass]
                if host [port/host: host]
                if port-id [port/port-id: to-integer port-id]
                if path [port/path: path]
                if target [port/target: target]
                if all [set-scheme scheme] [port/scheme: to-word scheme]
                port
            ] [if not no-error [net-error reform ["URL error:" url]]]
        ] in net-utils/url-parser 'self
    ]   
    decode-url: func ["Decode a URL into an object." url /local purl][
        purl: context [scheme: user: pass: host: port-id: path: target: tag: none]
        net-utils/url-parser/tag: none
        net-utils/url-parser/parse-url/no-error/set-scheme purl url
        purl/tag: net-utils/url-parser/tag
        purl
    ]
    
    init: func[][
        if none? root-dir [
            print reform ["== Enter target dir ( default is: " dirize system/options/home/public ")"]
            root-dir: ask "==: "
            if empty? root-dir [
                probe root-dir: dirize system/options/home/public
            ]
            root-dir: dirize to-file root-dir
        ]
        clear images
        clear links
        clear scripts
        clear stylesheets
        clear otherfiles
    ]   
    
    set-base-href: func[page-url /quiet][
        purl: decode-url page-url
        if none? purl/path [purl/path: copy ""]
        base-href: rejoin [
            http:// purl/host
            either none? purl/port-id [""][join ":" purl/port-id]
            #"/" purl/path
        ]
        if not quiet [
            base-path-parts: parse purl/path "/"
            insert base-path-parts join purl/host either none? purl/port-id [""][
                join "_atport-" purl/port-id
            ]
        ]
    ]
    get-relative-path: func[url /local ident purl url-path-parts bl ul q relative-path][ 
        ;print "======relative-path=========" probe url
        purl: decode-url trim url
        ;probe purl
        if none? purl [return none]
        if none? purl/path [purl/path: copy ""]
        url-path-parts: parse purl/path "/"
        insert url-path-parts join purl/host either none? purl/port-id [""][
            join "_atport-" purl/port-id
        ]
        ;print reform [" url:" mold url-path-parts]
        ;print reform ["base:" mold base-path-parts]
        relative-path: make file! 100
        bl: length? base-path-parts
        ul: length? url-path-parts
        forall base-path-parts [
            i: index? base-path-parts
            ;print ["t:" base-path-parts/1 url-path-parts/:i]
            if base-path-parts/1 <> url-path-parts/:i [
                q: bl - i + 1
                loop q [insert relative-path %../]
                
                for j i ul 1 [
                    insert tail relative-path join url-path-parts/:j #"/"
                ]
                break
            ]
        ]
        ;print [ul bl i q]
        if all [ul > bl i = bl none? q] [
            for j i + 1 ul 1 [
                insert tail relative-path join url-path-parts/:j #"/"
            ]
        ]
        base-path-parts: head base-path-parts
        insert tail relative-path either none? purl/target [
            "__default-file__.html"
        ][
            if found? find purl/target #"?" [
                replace purl/target #"?" "_query_"
                insert tail purl/target ".html"
            ]
            purl/target
        ]
        if not none? purl/tag [insert tail relative-path join #"#" purl/tag]
        ;insert tail relative-path any [purl/target "__default-file__.html"]
        ;print ["====" relative-path]
        return relative-path 
    ]

    get-local-file: func[url /local tmp file ext ourl][
        ;print "========local-file======="
        ourl: copy url
        tmp: decode-url url
        url: rejoin [
            tmp/host
            either none? tmp/port-id [""][join "_atport-" tmp/port-id]
            #"/" any [tmp/path ""] ;any [tmp/target ""]
        ]
        if not none? tmp/target [
            file: first parse tmp/target "?"
            if found? find tmp/target #"?" [
                replace tmp/target #"?" "_query_"
            ]
            ext: find/last/tail file "."
            if find cgi-extensions ext [
                ;probe url
                insert tail tmp/target join "." get-content-type-ext get-content-type to-url ourl
            ]
            insert tail url tmp/target
        ]
        ;prin "===="        probe url
        return url
    ]

    download-progress-text: func[total bytes][
        loop length? mold file-info/loaded [prin "^(back)"]
        prin file-info/loaded: bytes
        return true
    ]
    download-progress: func [total bytes] [
        prog/data: bytes / (max 1 total)
        stat/text: reform [bytes "bytes"]
        show [prog stat]
        not stop
    ]
    save-to-local: func[url /data dt /local file-path][
        file-path: join root-dir get-local-file url
        if #"/" = last url [ insert tail file-path "__default-file__.html"]
        ;error? try [
            either any [
                not exists? file-path
                data
            ][
                if not data [
                    dt: read-net/progress to-url url either text-version [:download-progress-text][:download-progress]
                ]
                ;if the file is not found dt is none!
                either none? dt [
                    if text-version [print " - NOT FOUND!"]
                ][
                    make-dir/deep copy/part file-path index? find/last file-path "/"
                    write/binary file-path dt
                    error? try [
                        if not none? file-info/date [
                            set-modes file-path [modification-date: file-info/date]
                        ]
                    ]
                    if text-version [print " - SAVED"]
                ]
            ][
                if text-version [
                    print either exists? file-path [" - ALREADY SAVED"]["error?"]
                ]
            ]
        ;]
    ]

    download-files: func[files][
        forall files [
            either text-version [
                prin [length? pages-to-search "/" length? pages-searched "D:" files/1 " "]
            ][
                downloadingfile/text: files/1
                stat/text: ""
                show [downloadingfile stat]
            ]
            ;prin rejoin [files/1 " - 0"]
            ;error? try [
                save-to-local files/1
            ;]
            ;print " ...OK"
        ]
    ]
    build-url: func[p][
        rejoin [
            p/scheme "://" p/host
            either none? p/port-id [""][join ":" p/port-id]
            #"/" any [p/path ""] any [p/target ""]
            either none? p/tag [""][join #"#" p/tag]
        ]
    ]
    get-absolute-url: func [path [string! none!] /local tmp u q w new-url][
    ;print "--------absolute-url======="
        ;probe path
        if none? path [return ""]
        path: trim/with path {'"}
        original-url: copy path
        if find/part path "javascript:" 11 [return none] 
        if find/part path "mailto:" 7 [return path]
        if path/1 = #"#" [insert path this-page-url]
        if path/1 = #"/" [
            parse base-href [copy w thru "://" copy q [to "/" | to end]]
            path: rejoin [w q path]
        ]
        tmp: decode-url path
        if all [none? tmp/scheme none? tmp/host][
            insert path base-href
        ]
        tmp: decode-url path
        either none? tmp/path [path: copy ""][
            dirs: parse tmp/path "/"
            forall dirs [
                switch first dirs [
                    ".." [dirs: back dirs remove/part dirs 2]
                    "."  [remove dirs]
                ]
            ]
            dirs: head dirs
            tmp/path: make string! 50
            forall dirs [
                insert tail tmp/path join first dirs #"/"
            ]
        ]
        ;prin "==== "
        path: build-url tmp
    ]       
    
    read-net: func [
        {Read a file from the net (web). Update progress bar. Allow abort.}
        url [url!]
        /progress callback {Call func [total bytes] during transfer. Return true.}
        /local port buffer data size
    ][
        if error? try [
            port: open/direct/binary/custom url compose/deep [header [(third http-header)]]
        ][  return none]
        size: to-integer any [port/locals/headers/content-length 8000]
        file-info: make object! [
            content-type: port/locals/headers/content-type
            date: port/date
            size: either none? port/locals/headers/content-length [none][
                to-integer port/locals/headers/content-length
            ]
            loaded: 0
        ]
        buffer: make binary! size
        set-modes port/sub-port [lines: false binary: true no-wait: true]
        until [
            if not data: wait [60 port/sub-port] [data: true break]
            if data: copy port/sub-port [append buffer data]
            all [:callback size not callback size length? buffer data: true break]
            not data
        ]
        close port
        if not data [buffer]
    ]
    forbidden-extensions: ["pif"  "bat" "scr" "vbs"]
    forbidden-extension?: func[url /local aaaa][
        if not series? url [return false]
        ;if error? [
            ;aaaa:
             find forbidden-extensions to string! find/last/tail "."
             ;"to string!" is here just for cases where none is returned
        ;][prin "EEEE" probe url halt]
        ;aaaa
    ]
    urlchars: charset [#"a" - #"z" #"A" - #"Z" #"0" - #"9" "?/\:&;#%~+-_.*="]
    geturl-rule: [opt [{"} | {'}] copy url any [urlchars] opt [{"} | {'}] y: to end]
    tag-rules: [
        "img" copy x thru {src=} geturl-rule (
            tag-name: "img"
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find images url not none? url] [insert images url]
        )
        | "body" copy x thru {background=} geturl-rule (
            tag-name: "BODY"
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find images url not none? url] [insert images url]
        )
        | "link" copy x thru {href=} geturl-rule (
            tag-name: "link"
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find stylesheets url not none? url] [insert stylesheets url ]
        )
        | "script" copy x thru {src=} geturl-rule (
            tag-name: "script"
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find scripts url not none? url] [insert scripts url ]          
        )
        | "BASE" copy x thru {href=} geturl-rule (
            either none? url [url: ""][
                set-base-href/quiet get-absolute-url url
                ;print rejoin ["new base-href: " base-href]
            ]
            tag-name: "BASE"
        )
        | "FRAME" copy x thru {src=} geturl-rule (
            tag-name: "frame"
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ]
        )
        | [   "EMBED" copy x thru {src=} (tag-name: "EMBED")
            | "PARAM NAME=movie" copy x thru {VALUE=} (tag-name: "PARAM NAME=movie")
          ] geturl-rule (
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find otherfiles url not none? url] [insert otherfiles url ]
        )
        | "a" copy x thru {href=} geturl-rule (
            tag-name: "a"
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ]
        )
        | "META" copy x thru {URL=} geturl-rule (
            tag-name: "META"
            url: get-absolute-url url
            if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ]
        )
    ]
    
    get-content-type: func[url /local tmp][
        either error? try [
            tmp: open/direct/no-wait url
            close tmp
        ][  none ][ tmp/locals/headers/content-type]
    ]
    get-content-type-ext: func[ct /local tmp][
        tmp: select [
            "text/plain"        "txt"
            "text/javascript"   "js"
            "text/html"         "html"
            "image/gif"         "gif"
        ] ct
        if none? tmp [tmp: "html"]
        tmp
    ]
    download-page: func[page-url /local orig-html ext tag local-url][
        system/options/quiet: true
        if text-version [
            print reform ["***DOWNLOADING:" page-url]
        ]
        insert pages-searched this-page-url: copy page-url
        init
        if error? try [
            page-markup: load/markup to-url page-url ;to-string page
        ][ print ["ERROR!: was not able to load/markup page:" page-url]  return false]
        ;print "--"
        set-base-href page-url
        if none? purl/target [
            insert tail page-url "__default-file__.html"
            ;print [tab "root url:" page-url]
        ]
        new-html: make string! 10000
        parse/all page-markup [
            some [
                set tag tag! (
                    if parse/all tag tag-rules [
                        if not none? url [
                            ;probe local-url: get-local-file copy url
                            ;prin "orig:"
                            ;probe original-url probe url
                            ;print rejoin [{"} local-url {"}]
                            tag: rejoin either tag-name = "BASE" [
                                ["<!BASE" x {'} url {'} y ">"]
                            ][
                                local-url: get-relative-path trim copy url
                                ["<" tag-name x {'} local-url {'} y ">"]
                            ]
                        ]
                    ]
                    insert tail new-html tag
                )
                | set tag any-type! (insert tail new-html tag)
            ]
        ]
        ;probe stylesheets
        ;probe images
        ;probe scripts
        ;probe links

        download-files images
        download-files scripts
        download-files stylesheets
        download-files otherfiles
        forall links [
            orig-link: copy first links
            if not found? find pages-searched orig-link [
                link: first parse (first links) "?"
                link: first parse link "#"
                download-files to-block orig-link
                ext: find/last/tail link "."
                if find cgi-extensions ext [
                    ext: get-content-type-ext  get-content-type to-url orig-link
                ]
                if any [
                    find ["html" "htm" "php"] ext ;I want to parse only documents of the HTML type!
                    #"/" = last link
                ][
                    either empty? wanna-search-rules [
                        insert pages-to-search orig-link
                    ][
                        foreach rule wanna-search-rules [
                            if parse/all orig-link rule [
                                insert pages-to-search orig-link
                                break
                            ]
                        ]
                    ]
                ]
            ]
        ]
        insert tail new-html rejoin [{^/<!-- saved by Oldes' Rebol-WebBot: } now { -->}]
        ;probe page-url
        save-to-local/data page-url new-html
        ;print "--"
        system/options/quiet: false
        true
    ]
    start-trip: func/pages pgs /window /local url][
        text-version: not window
        if window [
            view/new center-face lo: layout [
                space 10x8
                vh2 300 "Downloading File:"
                downloadingfile: vtext bold center 300 ""
                prog: progress 300
                across
                btn 90 "Cancel" [stop: true]
                stat: text 160x24 middle
            ]
        ]
        if pages [pages-to-search: copy pgs]
        while [not empty? pages-to-search] [
            url: first pages-to-search
            remove pages-to-search
            download-page url
            
        ]
    ]
]

;reb-web-bot/start-trip/pages [http://www.flumps.org/ip/c/indexc.html]
;reb-web-bot/start-trip/pages [http://127.0.0.1:85/dowtest/test/test2.html]
reb-web-bot/root-dir: %/d/test/
;reb-web-bot/download-page  http://127.0.0.1/cgi-bin/homes/cebus/menu.r?p=aktualne&lang=1
;reb-web-bot/download-page http://127.0.0.1/swf/menu.html
reb-web-bot/start-trip/pages [ http://127.0.0.1/]
;reb-web-bot/start-trip/pages [ http://127.0.0.1:81/index-ie.html]
;unview
;reb-web-bot/start-trip/pages [http://127.0.0.1:85/dowtest/test/test2.html]

;reb-web-bot/start-trip/pages [http://abunaih.nextlevelhost.com/wife/page_01.htm]

;halt