REBOL [
Title: "Rebol-WebBot"
Date: 17-Jan-2003/19:19:09+1:00
Version: 0.1.2
File: %reb-web-bot.r
Author: "Oldes"
Purpose: "To download pages from some url with all content"
History: [
0.1.2 {More bug fixes, added handling generated documents (content-type test)}
0.1.1 "A lot of bug fixes + travelling cycle added"
0.1.0 {Download-page function and important functions for URLs manipulations}
0.0.1 "Just a simple page parser"
]
Email: oliva.david@seznam.cz
Category: [markup net util web]
note: {
^-^-Author of this script uses it just for testing own pages and do not take any responsibility of using this script for other purposes!
^-^-Downloading some content available in the Internet may some people consider to be illegal! Also there is a security risk of downloading dangerous softwares as viruses and other unsecured files and storing them in your PCs so be very careful what you are going to download!
^-^-Running this script in high speed on some servers also may be considered as an attack or data stealing and may lead to banishing your IP address on these servers!}
todo: [
"Non blocking file download and better View version"
{Cookie handler (just to go thru all these cookies traps)}
"To modify only links to already downloaded files!"
{logic for downloading files (not to download already downloaded files)}
{test, test and test and then fix the bugs and clear the code}
{MySQL version for cooperation with more bots and large trips}
]
]
reb-web-bot: make object! [
;found links in this page:
images: make block! 50
links: make block! 500
scripts: make block! 10
stylesheets: make block! 10
otherfiles: make block! 10
text-version: true
http-header: make object! [
referer: none
cookie: none
user_agent: {Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)}
]
cgi-extensions: ["php" "php3" "cfm" "asp" "r" "cgi" "pl"]
stop: false
root-dir: none ;%/d/downloaded-pages/
purl: this-page-url: ;file that's processed
base-href: base-path-parts: none ;base url for relative paths
new-html: page-markup: none
x: y: url: tag-name: none ;help variables for rebuilding tags
pages-to-search: make block! 500 ;links that will be processed
pages-searched: make block! 500 ;links already processed
file-info: none ;contains file-size, date, content-type and loaded bytes of the
;file that's downloading
wanna-search-rules: make block! [] ;block with parse rules
;if none of them is true, the page will not be searched
;if it's empty, all links will be followed!
;EXAMPLE: wanna-search-rules: [[thru "test" thru ".html" end]]
; ==> all links wich contain "test" and ends with ".html" will be followed
if none? find [View Link] system/product [
net-utils/url-parser/parse-url: func [
{Return url dataset or cause an error if not a valid URL}
port [object! port!]
url
/set-scheme "Set port scheme"
/no-error "Do not throw an error, return NONE instead."
] bind [
set vars none
either parse/all url url-rules [
if net-watch [print ["URL Parse:" reduce vars]]
if user [port/user: user]
if pass [port/pass: pass]
if host [port/host: host]
if port-id [port/port-id: to-integer port-id]
if path [port/path: path]
if target [port/target: target]
if all [set-scheme scheme] [port/scheme: to-word scheme]
port
] [if not no-error [net-error reform ["URL error:" url]]]
] in net-utils/url-parser 'self
]
decode-url: func ["Decode a URL into an object." url /local purl][
purl: context [scheme: user: pass: host: port-id: path: target: tag: none]
net-utils/url-parser/tag: none
net-utils/url-parser/parse-url/no-error/set-scheme purl url
purl/tag: net-utils/url-parser/tag
purl
]
init: func[][
if none? root-dir [
print reform ["== Enter target dir ( default is: " dirize system/options/home/public ")"]
root-dir: ask "==: "
if empty? root-dir [
probe root-dir: dirize system/options/home/public
]
root-dir: dirize to-file root-dir
]
clear images
clear links
clear scripts
clear stylesheets
clear otherfiles
]
set-base-href: func[page-url /quiet][
purl: decode-url page-url
if none? purl/path [purl/path: copy ""]
base-href: rejoin [
http:// purl/host
either none? purl/port-id [""][join ":" purl/port-id]
#"/" purl/path
]
if not quiet [
base-path-parts: parse purl/path "/"
insert base-path-parts join purl/host either none? purl/port-id [""][
join "_atport-" purl/port-id
]
]
]
get-relative-path: func[url /local ident purl url-path-parts bl ul q relative-path][
;print "======relative-path=========" probe url
purl: decode-url trim url
;probe purl
if none? purl [return none]
if none? purl/path [purl/path: copy ""]
url-path-parts: parse purl/path "/"
insert url-path-parts join purl/host either none? purl/port-id [""][
join "_atport-" purl/port-id
]
;print reform [" url:" mold url-path-parts]
;print reform ["base:" mold base-path-parts]
relative-path: make file! 100
bl: length? base-path-parts
ul: length? url-path-parts
forall base-path-parts [
i: index? base-path-parts
;print ["t:" base-path-parts/1 url-path-parts/:i]
if base-path-parts/1 <> url-path-parts/:i [
q: bl - i + 1
loop q [insert relative-path %../]
for j i ul 1 [
insert tail relative-path join url-path-parts/:j #"/"
]
break
]
]
;print [ul bl i q]
if all [ul > bl i = bl none? q] [
for j i + 1 ul 1 [
insert tail relative-path join url-path-parts/:j #"/"
]
]
base-path-parts: head base-path-parts
insert tail relative-path either none? purl/target [
"__default-file__.html"
][
if found? find purl/target #"?" [
replace purl/target #"?" "_query_"
insert tail purl/target ".html"
]
purl/target
]
if not none? purl/tag [insert tail relative-path join #"#" purl/tag]
;insert tail relative-path any [purl/target "__default-file__.html"]
;print ["====" relative-path]
return relative-path
]
get-local-file: func[url /local tmp file ext ourl][
;print "========local-file======="
ourl: copy url
tmp: decode-url url
url: rejoin [
tmp/host
either none? tmp/port-id [""][join "_atport-" tmp/port-id]
#"/" any [tmp/path ""] ;any [tmp/target ""]
]
if not none? tmp/target [
file: first parse tmp/target "?"
if found? find tmp/target #"?" [
replace tmp/target #"?" "_query_"
]
ext: find/last/tail file "."
if find cgi-extensions ext [
;probe url
insert tail tmp/target join "." get-content-type-ext get-content-type to-url ourl
]
insert tail url tmp/target
]
;prin "====" probe url
return url
]
download-progress-text: func[total bytes][
loop length? mold file-info/loaded [prin "^(back)"]
prin file-info/loaded: bytes
return true
]
download-progress: func [total bytes] [
prog/data: bytes / (max 1 total)
stat/text: reform [bytes "bytes"]
show [prog stat]
not stop
]
save-to-local: func[url /data dt /local file-path][
file-path: join root-dir get-local-file url
if #"/" = last url [ insert tail file-path "__default-file__.html"]
;error? try [
either any [
not exists? file-path
data
][
if not data [
dt: read-net/progress to-url url either text-version [:download-progress-text][:download-progress]
]
;if the file is not found dt is none!
either none? dt [
if text-version [print " - NOT FOUND!"]
][
make-dir/deep copy/part file-path index? find/last file-path "/"
write/binary file-path dt
error? try [
if not none? file-info/date [
set-modes file-path [modification-date: file-info/date]
]
]
if text-version [print " - SAVED"]
]
][
if text-version [
print either exists? file-path [" - ALREADY SAVED"]["error?"]
]
]
;]
]
download-files: func[files][
forall files [
either text-version [
prin [length? pages-to-search "/" length? pages-searched "D:" files/1 " "]
][
downloadingfile/text: files/1
stat/text: ""
show [downloadingfile stat]
]
;prin rejoin [files/1 " - 0"]
;error? try [
save-to-local files/1
;]
;print " ...OK"
]
]
build-url: func[p][
rejoin [
p/scheme "://" p/host
either none? p/port-id [""][join ":" p/port-id]
#"/" any [p/path ""] any [p/target ""]
either none? p/tag [""][join #"#" p/tag]
]
]
get-absolute-url: func [path [string! none!] /local tmp u q w new-url][
;print "--------absolute-url======="
;probe path
if none? path [return ""]
path: trim/with path {'"}
original-url: copy path
if find/part path "javascript:" 11 [return none]
if find/part path "mailto:" 7 [return path]
if path/1 = #"#" [insert path this-page-url]
if path/1 = #"/" [
parse base-href [copy w thru "://" copy q [to "/" | to end]]
path: rejoin [w q path]
]
tmp: decode-url path
if all [none? tmp/scheme none? tmp/host][
insert path base-href
]
tmp: decode-url path
either none? tmp/path [path: copy ""][
dirs: parse tmp/path "/"
forall dirs [
switch first dirs [
".." [dirs: back dirs remove/part dirs 2]
"." [remove dirs]
]
]
dirs: head dirs
tmp/path: make string! 50
forall dirs [
insert tail tmp/path join first dirs #"/"
]
]
;prin "==== "
path: build-url tmp
]
read-net: func [
{Read a file from the net (web). Update progress bar. Allow abort.}
url [url!]
/progress callback {Call func [total bytes] during transfer. Return true.}
/local port buffer data size
][
if error? try [
port: open/direct/binary/custom url compose/deep [header [(third http-header)]]
][ return none]
size: to-integer any [port/locals/headers/content-length 8000]
file-info: make object! [
content-type: port/locals/headers/content-type
date: port/date
size: either none? port/locals/headers/content-length [none][
to-integer port/locals/headers/content-length
]
loaded: 0
]
buffer: make binary! size
set-modes port/sub-port [lines: false binary: true no-wait: true]
until [
if not data: wait [60 port/sub-port] [data: true break]
if data: copy port/sub-port [append buffer data]
all [:callback size not callback size length? buffer data: true break]
not data
]
close port
if not data [buffer]
]
forbidden-extensions: ["pif" "bat" "scr" "vbs"]
forbidden-extension?: func[url /local aaaa][
if not series? url [return false]
;if error? [
;aaaa:
find forbidden-extensions to string! find/last/tail "."
;"to string!" is here just for cases where none is returned
;][prin "EEEE" probe url halt]
;aaaa
]
urlchars: charset [#"a" - #"z" #"A" - #"Z" #"0" - #"9" "?/\:&;#%~+-_.*="]
geturl-rule: [opt [{"} | {'}] copy url any [urlchars] opt [{"} | {'}] y: to end]
tag-rules: [
"img" copy x thru {src=} geturl-rule (
tag-name: "img"
url: get-absolute-url url
if all [none? forbidden-extension? url none? find images url not none? url] [insert images url]
)
| "body" copy x thru {background=} geturl-rule (
tag-name: "BODY"
url: get-absolute-url url
if all [none? forbidden-extension? url none? find images url not none? url] [insert images url]
)
| "link" copy x thru {href=} geturl-rule (
tag-name: "link"
url: get-absolute-url url
if all [none? forbidden-extension? url none? find stylesheets url not none? url] [insert stylesheets url ]
)
| "script" copy x thru {src=} geturl-rule (
tag-name: "script"
url: get-absolute-url url
if all [none? forbidden-extension? url none? find scripts url not none? url] [insert scripts url ]
)
| "BASE" copy x thru {href=} geturl-rule (
either none? url [url: ""][
set-base-href/quiet get-absolute-url url
;print rejoin ["new base-href: " base-href]
]
tag-name: "BASE"
)
| "FRAME" copy x thru {src=} geturl-rule (
tag-name: "frame"
url: get-absolute-url url
if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ]
)
| [ "EMBED" copy x thru {src=} (tag-name: "EMBED")
| "PARAM NAME=movie" copy x thru {VALUE=} (tag-name: "PARAM NAME=movie")
] geturl-rule (
url: get-absolute-url url
if all [none? forbidden-extension? url none? find otherfiles url not none? url] [insert otherfiles url ]
)
| "a" copy x thru {href=} geturl-rule (
tag-name: "a"
url: get-absolute-url url
if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ]
)
| "META" copy x thru {URL=} geturl-rule (
tag-name: "META"
url: get-absolute-url url
if all [none? forbidden-extension? url none? find links url not none? url] [insert links url ]
)
]
get-content-type: func[url /local tmp][
either error? try [
tmp: open/direct/no-wait url
close tmp
][ none ][ tmp/locals/headers/content-type]
]
get-content-type-ext: func[ct /local tmp][
tmp: select [
"text/plain" "txt"
"text/javascript" "js"
"text/html" "html"
"image/gif" "gif"
] ct
if none? tmp [tmp: "html"]
tmp
]
download-page: func[page-url /local orig-html ext tag local-url][
system/options/quiet: true
if text-version [
print reform ["***DOWNLOADING:" page-url]
]
insert pages-searched this-page-url: copy page-url
init
if error? try [
page-markup: load/markup to-url page-url ;to-string page
][ print ["ERROR!: was not able to load/markup page:" page-url] return false]
;print "--"
set-base-href page-url
if none? purl/target [
insert tail page-url "__default-file__.html"
;print [tab "root url:" page-url]
]
new-html: make string! 10000
parse/all page-markup [
some [
set tag tag! (
if parse/all tag tag-rules [
if not none? url [
;probe local-url: get-local-file copy url
;prin "orig:"
;probe original-url probe url
;print rejoin [{"} local-url {"}]
tag: rejoin either tag-name = "BASE" [
["<!BASE" x {'} url {'} y ">"]
][
local-url: get-relative-path trim copy url
["<" tag-name x {'} local-url {'} y ">"]
]
]
]
insert tail new-html tag
)
| set tag any-type! (insert tail new-html tag)
]
]
;probe stylesheets
;probe images
;probe scripts
;probe links
download-files images
download-files scripts
download-files stylesheets
download-files otherfiles
forall links [
orig-link: copy first links
if not found? find pages-searched orig-link [
link: first parse (first links) "?"
link: first parse link "#"
download-files to-block orig-link
ext: find/last/tail link "."
if find cgi-extensions ext [
ext: get-content-type-ext get-content-type to-url orig-link
]
if any [
find ["html" "htm" "php"] ext ;I want to parse only documents of the HTML type!
#"/" = last link
][
either empty? wanna-search-rules [
insert pages-to-search orig-link
][
foreach rule wanna-search-rules [
if parse/all orig-link rule [
insert pages-to-search orig-link
break
]
]
]
]
]
]
insert tail new-html rejoin [{^/<!-- saved by Oldes' Rebol-WebBot: } now { -->}]
;probe page-url
save-to-local/data page-url new-html
;print "--"
system/options/quiet: false
true
]
start-trip: func/pages pgs /window /local url][
text-version: not window
if window [
view/new center-face lo: layout [
space 10x8
vh2 300 "Downloading File:"
downloadingfile: vtext bold center 300 ""
prog: progress 300
across
btn 90 "Cancel" [stop: true]
stat: text 160x24 middle
]
]
if pages [pages-to-search: copy pgs]
while [not empty? pages-to-search] [
url: first pages-to-search
remove pages-to-search
download-page url
]
]
]
;reb-web-bot/start-trip/pages [http://www.flumps.org/ip/c/indexc.html]
;reb-web-bot/start-trip/pages [http://127.0.0.1:85/dowtest/test/test2.html]
reb-web-bot/root-dir: %/d/test/
;reb-web-bot/download-page http://127.0.0.1/cgi-bin/homes/cebus/menu.r?p=aktualne&lang=1
;reb-web-bot/download-page http://127.0.0.1/swf/menu.html
reb-web-bot/start-trip/pages [ http://127.0.0.1/]
;reb-web-bot/start-trip/pages [ http://127.0.0.1:81/index-ie.html]
;unview
;reb-web-bot/start-trip/pages [http://127.0.0.1:85/dowtest/test/test2.html]
;reb-web-bot/start-trip/pages [http://abunaih.nextlevelhost.com/wife/page_01.htm]
;halt