REBOL [ Title: "Rebol-WebBot" Date: 17-Jan-2003/19:19:09+1:00 Version: 0.1.2 File: %reb-web-bot.r Author: "Oldes" Purpose: "To download pages from some url with all content" History: [ 0.1.2 {More bug fixes, added handling generated documents (content-type test)} 0.1.1 "A lot of bug fixes + travelling cycle added" 0.1.0 {Download-page function and important functions for URLs manipulations} 0.0.1 "Just a simple page parser" ] Email: oliva.david@seznam.cz Category: [markup net util web] note: { ^-^-Author of this script uses it just for testing own pages and do not take any responsibility of using this script for other purposes! ^-^-Downloading some content available in the Internet may some people consider to be illegal! Also there is a security risk of downloading dangerous softwares as viruses and other unsecured files and storing them in your PCs so be very careful what you are going to download! ^-^-Running this script in high speed on some servers also may be considered as an attack or data stealing and may lead to banishing your IP address on these servers!} todo: [ "Non blocking file download and better View version" {Cookie handler (just to go thru all these cookies traps)} "To modify only links to already downloaded files!" {logic for downloading files (not to download already downloaded files)} {test, test and test and then fix the bugs and clear the code} {MySQL version for cooperation with more bots and large trips} ] ] reb-web-bot: make object! [ ;found links in this page: images: make block! 50 links: make block! 500 scripts: make block! 10 stylesheets: make block! 10 otherfiles: make block! 10 text-version: true http-header: make object! [ referer: none cookie: none user_agent: {Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)} ] cgi-extensions: ["php" "php3" "cfm" "asp" "r" "cgi" "pl"] stop: false root-dir: none ;%/d/downloaded-pages/ purl: this-page-url: ;file that's processed base-href: base-path-parts: none ;base url for relative paths new-html: page-markup: none x: y: url: tag-name: none ;help variables for rebuilding tags pages-to-search: make block! 500 ;links that will be processed pages-searched: make block! 500 ;links already processed file-info: none ;contains file-size, date, content-type and loaded bytes of the ;file that's downloading wanna-search-rules: make block! [] ;block with parse rules ;if none of them is true, the page will not be searched ;if it's empty, all links will be followed! ;EXAMPLE: wanna-search-rules: [[thru "test" thru ".html" end]] ; ==> all links wich contain "test" and ends with ".html" will be followed if none? find [View Link] system/product [ net-utils/url-parser/parse-url: func [ {Return url dataset or cause an error if not a valid URL} port [object! port!] url /set-scheme "Set port scheme" /no-error "Do not throw an error, return NONE instead." ] bind [ set vars none either parse/all url url-rules [ if net-watch [print ["URL Parse:" reduce vars]] if user [port/user: user] if pass [port/pass: pass] if host [port/host: host] if port-id [port/port-id: to-integer port-id] if path [port/path: path] if target [port/target: target] if all [set-scheme scheme] [port/scheme: to-word scheme] port ] [if not no-error [net-error reform ["URL error:" url]]] ] in net-utils/url-parser 'self ] decode-url: func ["Decode a URL into an object." url /local purl][ purl: context [scheme: user: pass: host: port-id: path: target: tag: none] net-utils/url-parser/tag: none net-utils/url-parser/parse-url/no-error/set-scheme purl url purl/tag: net-utils/url-parser/tag purl ] init: func[][ if none? root-dir [ print reform ["== Enter target dir ( default is: " dirize system/options/home/public ")"] root-dir: ask "==: " if empty? root-dir [ probe root-dir: dirize system/options/home/public ] root-dir: dirize to-file root-dir ] clear images clear links clear scripts clear stylesheets clear otherfiles ] set-base-href: func[page-url /quiet][ purl: decode-url page-url if none? purl/path [purl/path: copy ""] base-href: rejoin [ http:// purl/host either none? purl/port-id [""][join ":" purl/port-id] #"/" purl/path ] if not quiet [ base-path-parts: parse purl/path "/" insert base-path-parts join purl/host either none? purl/port-id [""][ join "_atport-" purl/port-id ] ] ] get-relative-path: func[url /local ident purl url-path-parts bl ul q relative-path][ ;print "======relative-path=========" probe url purl: decode-url trim url ;probe purl if none? purl [return none] if none? purl/path [purl/path: copy ""] url-path-parts: parse purl/path "/" insert url-path-parts join purl/host either none? purl/port-id [""][ join "_atport-" purl/port-id ] ;print reform [" url:" mold url-path-parts] ;print reform ["base:" mold base-path-parts] relative-path: make file! 100 bl: length? base-path-parts ul: length? url-path-parts forall base-path-parts [ i: index? base-path-parts ;print ["t:" base-path-parts/1 url-path-parts/:i] if base-path-parts/1 <> url-path-parts/:i [ q: bl - i + 1 loop q [insert relative-path %../] for j i ul 1 [ insert tail relative-path join url-path-parts/:j #"/" ] break ] ] ;print [ul bl i q] if all [ul > bl i = bl none? q] [ for j i + 1 ul 1 [ insert tail relative-path join url-path-parts/:j #"/" ] ] base-path-parts: head base-path-parts insert tail relative-path either none? purl/target [ "__default-file__.html" ][ if found? find purl/target #"?" [ replace purl/target #"?" "_query_" insert tail purl/target ".html" ] purl/target ] if not none? purl/tag [insert tail relative-path join #"#" purl/tag] ;insert tail relative-path any [purl/target "__default-file__.html"] ;print ["====" relative-path] return relative-path ] get-local-file: func[url /local tmp file ext ourl][ ;print "========local-file=======" ourl: copy url tmp: decode-url url url: rejoin [ tmp/host either none? tmp/port-id [""][join "_atport-" tmp/port-id] #"/" any [tmp/path ""] ;any [tmp/target ""] ] if not none? tmp/target [ file: first parse tmp/target "?" if found? find tmp/target #"?" [ replace tmp/target #"?" "_query_" ] ext: find/last/tail file "." if find cgi-extensions ext [ ;probe url insert tail tmp/target join "." get-content-type-ext get-content-type to-url ourl ] insert tail url tmp/target ] ;prin "====" probe url return url ] download-progress-text: func[total bytes][ loop length? mold file-info/loaded [prin "^(back)"] prin file-info/loaded: bytes return true ] download-progress: func [total bytes] [ prog/data: bytes / (max 1 total) stat/text: reform [bytes "bytes"] show [prog stat] not stop ] save-to-local: func[url /data dt /local file-path][ file-path: join root-dir get-local-file url if #"/" = last url [ insert tail file-path "__default-file__.html"] ;error? try [ either any [ not exists? file-path data ][ if not data [ dt: read-net/progress to-url url either text-version [:download-progress-text][:download-progress] ] ;if the file is not found dt is none! either none? dt [ if text-version [print " - NOT FOUND!"] ][ make-dir/deep copy/part file-path index? find/last file-path "/" write/binary file-path dt error? try [ if not none? file-info/date [ set-modes file-path [modification-date: file-info/date] ] ] if text-version [print " - SAVED"] ] ][ if text-version [ print either exists? file-path [" - ALREADY SAVED"]["error?"] ] ] ;] ] download-files: func[files][ forall files [ either text-version [ prin [length? pages-to-search "/" length? pages-searched "D:" files/1 " "] ][ downloadingfile/text: files/1 stat/text: "" show [downloadingfile stat] ] ;prin rejoin [files/1 " - 0"] ;error? try [ save-to-local files/1 ;] ;print " ...OK" ] ] build-url: func[p][ rejoin [ p/scheme "://" p/host either none? p/port-id [""][join ":" p/port-id] #"/" any [p/path ""] any [p/target ""] either none? p/tag [""][join #"#" p/tag] ] ] get-absolute-url: func [path [string! none!] /local tmp u q w new-url][ ;print "--------absolute-url=======" ;probe path if none? path [return ""] path: trim/with path {'"} original-url: copy path if find/part path "javascript:" 11 [return none] if find/part path "mailto:" 7 [return path] if path/1 = #"#" [insert path this-page-url] if path/1 = #"/" [ parse base-href [copy w thru "://" copy q [to "/" | to end]] path: rejoin [w q path] ] tmp: decode-url path if all [none? tmp/scheme none? tmp/host][ insert path base-href ] tmp: decode-url path either none? tmp/path [path: copy ""][ dirs: parse tmp/path "/" forall dirs [ switch first dirs [ ".." [dirs: back dirs remove/part dirs 2] "." [remove dirs] ] ] dirs: head dirs tmp/path: make string! 50 forall dirs [ insert tail tmp/path join first dirs #"/" ] ] ;prin "==== " path: build-url tmp ] read-net: func [ {Read a file from the net (web). Update progress bar. Allow abort.} url [url!] /progress callback {Call func [total bytes] during transfer. Return true.} /local port buffer data size ][ if error? try [ port: open/direct/binary/custom url compose/deep [header [(third http-header)]] ][ return none] size: to-integer any [port/locals/headers/content-length 8000] file-info: make object! [ content-type: port/locals/headers/content-type date: port/date size: either none? port/locals/headers/content-length [none][ to-integer port/locals/headers/content-length ] loaded: 0 ] buffer: make binary! size set-modes port/sub-port [lines: false binary: true no-wait: true] until [ if not data: wait [60 port/sub-port] [data: true break] if data: copy port/sub-port [append buffer data] all [:callback size not callback size length? buffer data: true break] not data ] close port if not data [buffer] ] forbidden-extensions: ["pif" "bat" "scr" "vbs"] forbidden-extension?: func[url /local aaaa][ if not series? url [return false] ;if error? [ ;aaaa: find forbidden-extensions to string! find/last/tail "." ;"to string!" is here just for cases where none is returned ;][prin "EEEE" probe url halt] ;aaaa ] urlchars: charset [#"a" - #"z" #"A" - #"Z" #"0" - #"9" "?/\:&;#%~+-_.*="] geturl-rule: [opt [{"} | {'}] copy url any [urlchars] opt [{"} | {'}] y: to end] tag-rules: [ "img" copy x thru {src=} geturl-rule ( tag-name: "img" url: get-absolute-url url if all [none? forbidden-extension? url none? find images url not none? url] [insert images url] ) | "body" copy x thru {background=} geturl-rule ( tag-name: "BODY" url: get-absolute-url url if all [none? forbidden-extension? url none? find images url not none? url] [insert images url] ) | "link" copy x thru {href=} geturl-rule ( tag-name: "link" url: get-absolute-url url if all [none? forbidden-extension? url none? find stylesheets url not none? url] [insert stylesheets url ] ) | "script" copy x thru {src=} geturl-rule ( tag-name: "script" url: get-absolute-url url if all [none? forbidden-extension? url none? find scripts url not none? url] [insert scripts url ] ) | "BASE" copy x thru {href=} geturl-rule ( either none? url [url: ""][ set-base-href/quiet get-absolute-url url ;print rejoin ["new base-href: " base-href] ] tag-name: "BASE" ) | "FRAME" copy x thru {src=} geturl-rule ( tag-name: "frame" url: get-absolute-url url if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ] ) | [ "EMBED" copy x thru {src=} (tag-name: "EMBED") | "PARAM NAME=movie" copy x thru {VALUE=} (tag-name: "PARAM NAME=movie") ] geturl-rule ( url: get-absolute-url url if all [none? forbidden-extension? url none? find otherfiles url not none? url] [insert otherfiles url ] ) | "a" copy x thru {href=} geturl-rule ( tag-name: "a" url: get-absolute-url url if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ] ) | "META" copy x thru {URL=} geturl-rule ( tag-name: "META" url: get-absolute-url url if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ] ) ] get-content-type: func[url /local tmp][ either error? try [ tmp: open/direct/no-wait url close tmp ][ none ][ tmp/locals/headers/content-type] ] get-content-type-ext: func[ct /local tmp][ tmp: select [ "text/plain" "txt" "text/javascript" "js" "text/html" "html" "image/gif" "gif" ] ct if none? tmp [tmp: "html"] tmp ] download-page: func[page-url /local orig-html ext tag local-url][ system/options/quiet: true if text-version [ print reform ["***DOWNLOADING:" page-url] ] insert pages-searched this-page-url: copy page-url init if error? try [ page-markup: load/markup to-url page-url ;to-string page ][ print ["ERROR!: was not able to load/markup page:" page-url] return false] ;print "--" set-base-href page-url if none? purl/target [ insert tail page-url "__default-file__.html" ;print [tab "root url:" page-url] ] new-html: make string! 10000 parse/all page-markup [ some [ set tag tag! ( if parse/all tag tag-rules [ if not none? url [ ;probe local-url: get-local-file copy url ;prin "orig:" ;probe original-url probe url ;print rejoin [{"} local-url {"}] tag: rejoin either tag-name = "BASE" [ [""] ][ local-url: get-relative-path trim copy url ["<" tag-name x {'} local-url {'} y ">"] ] ] ] insert tail new-html tag ) | set tag any-type! (insert tail new-html tag) ] ] ;probe stylesheets ;probe images ;probe scripts ;probe links download-files images download-files scripts download-files stylesheets download-files otherfiles forall links [ orig-link: copy first links if not found? find pages-searched orig-link [ link: first parse (first links) "?" link: first parse link "#" download-files to-block orig-link ext: find/last/tail link "." if find cgi-extensions ext [ ext: get-content-type-ext get-content-type to-url orig-link ] if any [ find ["html" "htm" "php"] ext ;I want to parse only documents of the HTML type! #"/" = last link ][ either empty? wanna-search-rules [ insert pages-to-search orig-link ][ foreach rule wanna-search-rules [ if parse/all orig-link rule [ insert pages-to-search orig-link break ] ] ] ] ] ] insert tail new-html rejoin [{^/}] ;probe page-url save-to-local/data page-url new-html ;print "--" system/options/quiet: false true ] start-trip: func[/pages pgs /window /local url][ text-version: not window if window [ view/new center-face lo: layout [ space 10x8 vh2 300 "Downloading File:" downloadingfile: vtext bold center 300 "" prog: progress 300 across btn 90 "Cancel" [stop: true] stat: text 160x24 middle ] ] if pages [pages-to-search: copy pgs] while [not empty? pages-to-search] [ url: first pages-to-search remove pages-to-search download-page url ] ] ] ;reb-web-bot/start-trip/pages [http://www.flumps.org/ip/c/indexc.html] ;reb-web-bot/start-trip/pages [http://127.0.0.1:85/dowtest/test/test2.html] reb-web-bot/root-dir: %/d/test/ ;reb-web-bot/download-page http://127.0.0.1/cgi-bin/homes/cebus/menu.r?p=aktualne&lang=1 ;reb-web-bot/download-page http://127.0.0.1/swf/menu.html reb-web-bot/start-trip/pages [ http://127.0.0.1/] ;reb-web-bot/start-trip/pages [ http://127.0.0.1:81/index-ie.html] ;unview ;reb-web-bot/start-trip/pages [http://127.0.0.1:85/dowtest/test/test2.html] ;reb-web-bot/start-trip/pages [http://abunaih.nextlevelhost.com/wife/page_01.htm] ;halt