From 832ab6e2f4ae3d92fa2b8cfa78012c28c18a76c2 Mon Sep 17 00:00:00 2001 From: Leon van Kammen Date: Tue, 11 Nov 2025 23:21:21 +0100 Subject: [PATCH] > scrape a room URL and rewrite JML to serve local assets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ```bash $ ./janusxr --scrape https://www.janusxr.org/newlobby/index.html mydir 🔗 http://dizzket.com/archive/dotmatrix/ 🔗 https://vesta.janusvr.com/nazrin/minecraft-sandbox ✅ http://www.janusvr.com/newlobby/scripts/home.txt ... $ ls -la mydir index.html home.txt ... ``` --- README.md | 26 ++++++++++++++++--- janusxr | 74 +++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index ca3d106..40d3625 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,17 @@ Portable swiss-army knife to automate [janusXR](https://janusxr.org) / JML thing > *What is [janusXR](https://janusxr.org)?* It's an open, user-operated immersive web layer, open-sourced by the JanusVR company, that seamlessly lives within websites, even 12 years later thanks to [janusweb](https://github.com/jbaicoianu/janusweb). It provides a highly viable and easy-to-adopt ecosystem of portals, enabling immersive experiences that challenge the traditional app store paradigm. Get started with [this guide](https://madjin.github.io/janus-guide/#/). - # Usage +``` +Usage: ./janusxr --health [--max-time-per-asset 5] + ./janusxr --scrape +``` +## Examples + > scan a room URL for broken links in JML+HTML -``` +```bash $ ./janusxr --health http://localhost:8790/models/m5gr26w0wqqs ✅ http://localhost:8791/templates/xrfragment/%232/website.glb @@ -28,7 +33,22 @@ $ ./janusxr --health http://localhost:8790/models/m5gr26w0wqqs ``` -# Awk? 
+> scrape a room URL and rewrite JML to serve local assets
+
+```bash
+$ ./janusxr --scrape https://www.janusxr.org/newlobby/index.html mydir
+🔗 http://dizzket.com/archive/dotmatrix/
+🔗 https://vesta.janusvr.com/nazrin/minecraft-sandbox
+✅ http://www.janusvr.com/newlobby/scripts/home.txt
+...
+
+$ ls -la mydir
+index.html
+home.txt
+...
+```
+
+## Awk?
 
 Why not some superfancy scripting for this task?
 
diff --git a/janusxr b/janusxr
index 837825f..1751a1b 100755
--- a/janusxr
+++ b/janusxr
@@ -1,7 +1,8 @@
 #!/usr/bin/env -S awk -f
 
 function usage() {
-  print "Usage: ./janusxr --health \n"
+  print "Usage: ./janusxr --health <room_url> [--max-time-per-asset 5] \n" \
+        "       ./janusxr --scrape <room_url> [<outdir>] \n"
   exit 1
 }
 
@@ -11,15 +12,15 @@ function usage() {
 BEGIN {
   if (ARGC < 2) usage()
   command = ARGV[1]
-  if (command == "--health") {
-    health()
+  if (command == "--health" || command == "--scrape") {
+    scrape(command)  # both commands share scrape(); it branches on the flag
   } else {
     printf("Unknown command: %s\n", command)
     usage()
   }
 }
 
-function health( tmpfile, line, attr, check, u) {
+function scrape( arg, tmpfile, line, attr, check, u) {  # arg: "--health" or "--scrape"
   url = ARGV[2]
   if (url == "") {
     print "❌ Missing URL argument."
@@ -46,18 +47,128 @@ function health( tmpfile, line, attr, check, u) {
   close(tmpfile)
 
   # Check each extracted links
-  nlinks = 0
-  nlinksok = 0
-  for (u in links) {
-    if( substr(u,1,1) == "/" ) u = rooturl""u
-    check = "curl -I -s \"" u "\" > /dev/null"
-    if (system(check) == 0){
-      nlinksok++
-      printf("✅ %s\n", u)
-    }else printf("❌ %s\n", u)
-    nlinks+=1
+  if( arg == "--health" ){
+    # Accept both the documented "--max-time-per-asset <seconds>" flag and a
+    # bare number in ARGV[3]. Anything non-numeric falls back to 5 seconds, so
+    # unvalidated text never reaches the curl command line.
+    maxtime = ARGV[3]
+    if ( maxtime == "--max-time-per-asset" ) maxtime = ARGV[4]
+    if ( maxtime !~ /^[0-9]+(\.[0-9]+)?$/ ) maxtime = 5
+    nlinks = 0
+    nlinksok = 0
+    for (u in links) {
+      target = u
+      if( substr(target,1,1) == "/" ) target = rooturl""target
+      # Links come from a remote document: quote them before they hit the shell.
+      check = "curl -L --max-time " maxtime " -I -s " shell_quote(target) " > /dev/null"
+      if (system(check) == 0){
+        nlinksok++
+        printf("✅ %s\n", target)
+      }else printf("❌ %s\n", target)
+      nlinks+=1
+    }
+    # A page without links used to abort on 100/0 here.
+    if( nlinks == 0 ){
+      print "⚕️ health: no links found"
+    }else{
+      print "⚕️ health: "(( 100/nlinks )*nlinksok)"%"
+    }
+    if( nlinks != nlinksok ) exit(1)
+  }
+
+  if( arg == "--scrape" ) {
+    outdir = ARGV[3]
+    if ( outdir == "" ) outdir = "."
+    indexfile = outdir"/index.html"
+    system("mkdir -p -- " shell_quote(outdir))
+    system("cp -- " shell_quote(tmpfile) " " shell_quote(indexfile))
+
+    for (u in links) {
+      target = u
+      if( substr(target,1,1) == "/" ) target = rooturl""target
+      check = "curl -L --max-time 20 -I -s " shell_quote(target) " > /dev/null"
+      if (system(check) == 0 && has_non_html_ext(target) ){
+        filepart = asset_filename(target)
+        outfile = outdir"/"filepart
+        system("curl -L --max-time 20 -s " shell_quote(target) " > " shell_quote(outfile))
+        # Rewrite the reference as keyed in links (u), not the absolutised
+        # form: otherwise root-relative assets are downloaded but index.html
+        # keeps the remote path. Root-relative keys are only replaced as whole
+        # quoted values so they cannot clip the tail of a longer absolute URL
+        # (assumes attribute values are quoted in the markup; TODO confirm).
+        if( substr(u,1,1) == "/" ){
+          rewrite_reference(indexfile, "\"" u "\"", "\"" filepart "\"")
+          rewrite_reference(indexfile, "'" u "'", "'" filepart "'")
+        }else rewrite_reference(indexfile, u, filepart)
+        nlinksok++
+        printf("✅ %s\n", target)
+      }else printf("🔗 %s\n", target)
+      nlinks+=1
+    }
   }
-  print "⚕️ health: "(( 100/nlinks )*nlinksok)"%"
-  if( nlinks != nlinksok ) exit(1)
 }
 
+# Function: has_non_html_ext
+# Returns 1 if the path part of file (scheme://host, query string and
+# fragment ignored) ends in a segment with an extension other than
+# html/htm (case-insensitive). Returns 0 otherwise.
+# arr is scratch space for split(); callers may omit it.
+function has_non_html_ext(file, arr,    n, name) {
+  sub(/[?#].*$/, "", file)
+  # Drop scheme://host so a bare "https://example.com" is not read as ".com"
+  sub(/^[A-Za-z][A-Za-z0-9+.-]*:\/\/[^\/]*/, "", file)
+
+  # Check for at least one slash
+  if (file !~ /\//) return 0
+
+  # Two-argument match() plus RSTART: the three-argument form is a gawk
+  # extension, while the shebang runs whatever "awk" is installed.
+  n = split(file, arr, "/")
+  name = arr[n]
+  if (!match(name, /\.[^.]+$/)) return 0
+
+  name = tolower(substr(name, RSTART + 1))
+  if (name != "html" && name != "htm") return 1
+
+  return 0
+}
+
+# Function: asset_filename
+# Returns the last path segment of url, without query string or fragment.
+function asset_filename(url,    parts, n) {
+  sub(/[?#].*$/, "", url)
+  n = split(url, parts, "/")
+  return parts[n]
+}
+
+# Function: rewrite_reference
+# Replaces every literal occurrence of from with to inside file.
+# Done in awk instead of sed so characters such as . ? & | ' in a URL are
+# never interpreted as regex, replacement or shell syntax.
+function rewrite_reference(file, from, to,    line, out, pos, buf, n, i, status) {
+  if (from == "") return
+  n = 0
+  while ((status = (getline line < file)) > 0) {
+    out = ""
+    while ((pos = index(line, from)) > 0) {
+      out = out substr(line, 1, pos - 1) to
+      line = substr(line, pos + length(from))
+    }
+    buf[++n] = out line
+  }
+  close(file)
+  if (status < 0) return   # file unreadable: leave it untouched
+  printf "" > file         # truncate, then write the buffered lines back
+  for (i = 1; i <= n; i++) print buf[i] > file
+  close(file)
+}
+
+# Function: shell_quote
+# Returns s wrapped in single quotes with embedded single quotes escaped,
+# so the shell started by system() sees it as exactly one literal word.
+function shell_quote(s,    parts, n, i, out) {
+  n = split(s, parts, "'")
+  out = parts[1]
+  for (i = 2; i <= n; i++) out = out "'\\''" parts[i]
+  return "'" out "'"
+}