#!/usr/bin/env -S awk -f
# Print CLI usage and abort with a non-zero exit status.
# Note: the per-asset timeout is a bare positional argument (the health
# branch reads it directly from ARGV[3]), so it is shown positionally here;
# the previous text wrongly advertised a --max-time-per-asset flag that the
# code never parses.
function usage() {
    print "Usage: ./janusxr health <room_url> [max_time_per_asset=5]\n" \
          "       ./janusxr scrape <room_url> <outdir>\n" \
          "       ./janusxr optimize <room_url>\n"
    exit 1
}
# ------------------------------
# Core dispatcher
# ------------------------------
BEGIN {
    if (ARGC < 2) usage()
    command = ARGV[1]
    if (command == "health" || command == "scrape" || command == "optimize") {
        scrape(command)
        # Stop here: without an explicit exit, awk falls through to its main
        # input loop and tries to open the remaining ARGV entries ("health",
        # the room URL, ...) as data files, which fails at runtime.
        exit 0
    } else {
        # (Removed leftover debug printf calls that used ARGV values as the
        # printf FORMAT string — a "%" in an argument would corrupt output.)
        printf("Unknown command: %s\n", command)
        usage()
    }
}
# Fetch a room page, extract every src="..."/url="..." attribute value, then
# run the action selected by `arg`:
#   health   - HEAD-check every link, print a health percentage, exit 1 on failures
#   scrape   - download non-HTML assets into <outdir> and rewrite index.html
#   optimize - emit <a>/<link rel='preconnect'> tags for copy/paste
# Every parameter after `arg` is an AWK-style local (callers pass only `arg`);
# the previous version leaked url/rooturl/links/counters as globals.
# NOTE(review): URLs are interpolated into shell command lines below; a URL
# containing quotes or shell metacharacters could inject commands. Only run
# this against trusted room URLs.
function scrape(arg,    url, rooturl, tmpfile, cmd, line, attr, check, u,
                        links, maxtime, nlinks, nlinksok, outdir, n, fileparts,
                        filepart, outfile, domain, urlpart, domains, pct) {
    url = ARGV[2]
    if (url == "") {
        print "❌ Missing URL argument."
        usage()
    }

    # Site root: the URL with its last path component stripped; used to
    # absolutize links that start with "/".
    rooturl = url
    sub(/\/[^\/]+$/, "", rooturl)
    tmpfile = "/tmp/out.html"

    # Fetch the HTML page with curl.
    cmd = "curl -s \"" url "\" -o \"" tmpfile "\""
    if (system(cmd) != 0) { print "❌ Failed to fetch " url; exit 1 }

    # Collect unique link targets from src="..." and url="..." attributes,
    # scanning each line repeatedly so multiple attributes per line are found.
    while ((getline line < tmpfile) > 0) {
        while (match(line, /(src|url)="[^"]+"/)) {
            attr = substr(line, RSTART, RLENGTH)
            sub(/^[^=]+="/, "", attr)   # strip leading  src=" / url="
            sub(/"$/, "", attr)         # strip trailing quote
            links[attr] = 1
            line = substr(line, RSTART + RLENGTH)
        }
    }
    close(tmpfile)

    if (arg == "health") {
        # Optional positional argument: per-asset curl timeout in seconds.
        maxtime = ARGV[3]
        if (maxtime == "") maxtime = 5
        nlinks = 0
        nlinksok = 0
        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L --max-time " maxtime " -I -s \"" u "\" > /dev/null"
            if (system(check) == 0) {
                nlinksok++
                printf("✅ %s\n", u)
            } else printf("❌ %s\n", u)
            nlinks += 1
        }
        # Guard against division by zero when the page yielded no links
        # (the old expression 100/nlinks aborted awk in that case).
        pct = (nlinks > 0) ? (100 / nlinks) * nlinksok : 0
        print "⚕️ health: " pct "%"
        if (nlinks != nlinksok) exit(1)
    }

    if (arg == "scrape") {
        outdir = ARGV[3]
        if (outdir == "") outdir = "."
        # -p: create intermediate directories and succeed when outdir exists
        # (the old `mkdir ... || true` masked every mkdir failure, not just EEXIST).
        system("mkdir -p \"" outdir "\"")
        system("cp \"" tmpfile "\" \"" outdir "/index.html\"")

        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L --max-time 20 -I -s \"" u "\" > /dev/null"
            if (system(check) == 0 && has_non_html_ext(u)) {
                # Save the asset under its basename, then point index.html at
                # the local copy.
                n = split(u, fileparts, "/")
                filepart = fileparts[n]
                outfile = outdir "/" filepart
                system("curl -L --max-time 20 -s \"" u "\" > \"" outfile "\"")
                # NOTE(review): the substitution breaks if the URL contains
                # "|", "&" or other sed-special characters — verify inputs.
                system("sed -i 's|" u "|" filepart "|g' \"" outdir "/index.html\"")
                nlinksok++
                printf("✅ %s\n", u)
            } else printf("🔗 %s\n", u)
            nlinks += 1
        }
    }

    if (arg == "optimize") {
        printf("<!-- copy/paste below into your HTML/JML-file -->\n")
        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L --max-time 20 -I -s \"" u "\" > /dev/null"
            if (system(check) == 0 && has_non_html_ext(u)) {
                printf("<a href='" u "'/>\n")
                # Emit one preconnect hint per distinct scheme://host pair.
                split(u, urlpart, "/")
                domain = urlpart[1] "//" urlpart[3]
                if (!domains[domain]) {
                    printf("<link rel='preconnect' href='" domain "'/>\n")
                    domains[domain] = 1
                }
            }
        }
    }
}
# Function: has_non_html_ext
# Returns 1 if `file` contains at least one slash and ends in a dot-extension
# that is not "html"; returns 0 otherwise.
# `ext` is an AWK-style local parameter (callers pass only `file`).
# Portability fix: the original used gawk's three-argument match(file, re, arr),
# a gawk extension that is a fatal error in mawk/nawk — but the shebang runs
# generic `awk`. This version uses POSIX RSTART/RLENGTH instead.
function has_non_html_ext(file, ext) {
    # Require at least one slash (i.e. a path/URL, not a bare name).
    if (file !~ /\//) return 0

    # Extension = text after the final dot, containing no dot or slash.
    if (match(file, /\.[^.\/]+$/)) {
        ext = substr(file, RSTART + 1, RLENGTH - 1)
        if (ext != "html") return 1
    }

    return 0
}