#!/usr/bin/env -S awk -f

function usage() {
    print "Usage: ./janusxr health <url> [max-time-per-asset, default 5]\n" \
          "       ./janusxr scrape <url> [outdir]\n" \
          "       ./janusxr optimize <url>"
    exit 1
}

# ------------------------------
# Core dispatcher
# ------------------------------
BEGIN {
    if (ARGC < 2) usage()
    command = ARGV[1]
    if (command == "health" || command == "scrape" || command == "optimize") {
        scrape(command)
    } else {
        printf("Unknown command: %s\n", command)
        usage()
    }
    # Exit explicitly so awk does not try to read the remaining ARGV
    # entries ("health", the URL, ...) as input files.
    exit 0
}

function scrape(arg,    tmpfile, line, attr, check, u, cmd, n, fileparts, filepart, outfile, urlpart, domain) {
    url = ARGV[2]
    if (url == "") {
        print "❌ Missing URL argument."
        usage()
    }

    # Derive the root URL by stripping the last path component, so that
    # relative links starting with "/" can be resolved against it.
    rooturl = url
    sub(/\/[^\/]+$/, "", rooturl)

    tmpfile = "/tmp/out.html"

    # Fetch the HTML page using curl
    cmd = "curl -s \"" url "\" -o " tmpfile
    if (system(cmd) != 0) {
        print "❌ Failed to fetch " url
        exit 1
    }

    # Extract every src="..." or url="..." attribute value
    while ((getline line < tmpfile) > 0) {
        while (match(line, /(src|url)="[^"]+"/)) {
            attr = substr(line, RSTART, RLENGTH)
            sub(/^[^=]+="/, "", attr)    # drop the leading src="/url="
            sub(/"$/, "", attr)          # drop the trailing quote
            links[attr] = 1
            line = substr(line, RSTART + RLENGTH)
        }
    }
    close(tmpfile)

    # Check each extracted link
    if (arg == "health") {
        maxtime = ARGV[3]
        if (maxtime == "") maxtime = 5
        nlinks = 0
        nlinksok = 0
        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            # -f makes curl fail on HTTP errors (e.g. 404), not just on
            # connection errors, so broken links are actually reported.
            check = "curl -L -f --max-time " maxtime " -I -s \"" u "\" > /dev/null"
            if (system(check) == 0) {
                nlinksok++
                printf("✅ %s\n", u)
            } else {
                printf("❌ %s\n", u)
            }
            nlinks++
        }
        if (nlinks > 0)
            print "⚕️ health: " (100 / nlinks * nlinksok) "%"
        if (nlinks != nlinksok) exit 1
    }

    if (arg == "scrape") {
        outdir = ARGV[3]
        if (outdir == "") outdir = "."
        system("mkdir -p \"" outdir "\"")
        system("cp " tmpfile " \"" outdir "\"/index.html")
        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L -f --max-time 20 -I -s \"" u "\" > /dev/null"
            if (system(check) == 0 && has_non_html_ext(u)) {
                # Download the asset and rewrite its URL in index.html
                # to the local file name (naive: assumes the URL
                # contains no "|", the sed delimiter).
                n = split(u, fileparts, "/")
                filepart = fileparts[n]
                outfile = outdir "/" filepart
                system("curl -L --max-time 20 -s \"" u "\" > \"" outfile "\"")
                system("sed -i 's|" u "|" filepart "|g' \"" outdir "\"/index.html")
                printf("✅ %s\n", u)
            } else {
                printf("🔗 %s\n", u)
            }
        }
    }

    if (arg == "optimize") {
        # List reachable non-HTML assets and the distinct domains they
        # are served from.
        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L -f --max-time 20 -I -s \"" u "\" > /dev/null"
            if (system(check) == 0 && has_non_html_ext(u)) {
                printf("📦 %s\n", u)
                split(u, urlpart, "/")
                domain = urlpart[1] "//" urlpart[3]
                if (!(domain in domains)) {
                    printf("🌐 asset domain: %s\n", domain)
                    domains[domain] = 1
                }
            }
        }
    }
}

# Function: has_non_html_ext
# Returns 1 if the file has an extension after at least one slash and it is not "html"
# Returns 0 otherwise
function has_non_html_ext(file,    ext) {
    # Require at least one slash
    if (file !~ /\//) return 0
    # Match the extension after the last dot. RSTART/RLENGTH are used
    # instead of the 3-argument match(), which is gawk-only.
    if (match(file, /\.[^.\/]+$/)) {
        ext = substr(file, RSTART + 1, RLENGTH - 1)
        if (ext != "html") return 1
    }
    return 0
}
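
# ------------------------------
# Example invocations (a sketch; the URLs below are hypothetical and
# only illustrate the positional arguments each command expects):
#
#   ./janusxr health   https://example.com/index.html 10
#   ./janusxr scrape   https://example.com/index.html ./mirror
#   ./janusxr optimize https://example.com/index.html
# ------------------------------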