> scrape a room URL and rewrite JML to serve local assets

```bash
$ ./janusxr --scrape https://www.janusxr.org/newlobby/index.html mydir
🔗 http://dizzket.com/archive/dotmatrix/
🔗 https://vesta.janusvr.com/nazrin/minecraft-sandbox
✅ http://www.janusvr.com/newlobby/scripts/home.txt
...

$ ls -la mydir
index.html
home.txt
...
```
This commit is contained in:
Leon van Kammen 2025-11-11 23:21:21 +01:00
parent fde184608f
commit 832ab6e2f4
2 changed files with 81 additions and 19 deletions

View file

@ -4,12 +4,17 @@ Portable swiss-army knife to automate [janusXR](https://janusxr.org) / JML thing
> *What is [janusXR](https://janusxr.org)?* It's an open, user-operated immersive web layer, open-sourced by the JanusVR company, that seamlessly lives within websites, even 12 years later thanks to [janusweb](https://github.com/jbaicoianu/janusweb). It provides a highly viable and easy-to-adopt ecosystem of portals, enabling immersive experiences that challenge the traditional app store paradigm. Get started with [this guide](https://madjin.github.io/janus-guide/#/).
# Usage
```
Usage: ./janusxr --health <room_url> [--max-time-per-asset 5]
./janusxr --scrape <room_url> <outdir>
```
## Examples
> scan a room URL for broken links in JML+HTML
```
```bash
$ ./janusxr --health http://localhost:8790/models/m5gr26w0wqqs
✅ http://localhost:8791/templates/xrfragment/%232/website.glb
@ -28,7 +33,22 @@ $ ./janusxr --health http://localhost:8790/models/m5gr26w0wqqs
```
# Awk?
> scrape a room URL and rewrite JML to serve local assets
```bash
$ ./janusxr --scrape https://www.janusxr.org/newlobby/index.html mydir
🔗 http://dizzket.com/archive/dotmatrix/
🔗 https://vesta.janusvr.com/nazrin/minecraft-sandbox
✅ http://www.janusvr.com/newlobby/scripts/home.txt
...
$ ls -la mydir
index.html
home.txt
...
```
## Awk?
Why not some superfancy scripting for this task?

52
janusxr
View file

@ -1,7 +1,8 @@
#!/usr/bin/env -S awk -f
function usage() {
print "Usage: ./janusxr --health <room_url> \n"
print "Usage: ./janusxr --health <room_url> [--max-time-per-asset 5] \n" \
" ./janusxr --scrape <room_url> <outdir>\n"
exit 1
}
@ -11,15 +12,15 @@ function usage() {
BEGIN {
if (ARGC < 2) usage()
command = ARGV[1]
if (command == "--health") {
health()
if (command == "--health" || command == "--scrape") {
scrape(command)
} else {
printf("Unknown command: %s\n", command)
usage()
}
}
function health( tmpfile, line, attr, check, u) {
function scrape( arg, tmpfile, line, attr, check, u) {
url = ARGV[2]
if (url == "") {
print "❌ Missing URL argument."
@ -46,11 +47,14 @@ function health( tmpfile, line, attr, check, u) {
close(tmpfile)
# Check each extracted links
if( arg == "--health" ){
maxtime = ARGV[3]
if ( maxtime == "" ) maxtime = 5
nlinks = 0
nlinksok = 0
for (u in links) {
if( substr(u,1,1) == "/" ) u = rooturl""u
check = "curl -I -s \"" u "\" > /dev/null"
check = "curl -L --max-time "maxtime" -I -s \"" u "\" > /dev/null"
if (system(check) == 0){
nlinksok++
printf("✅ %s\n", u)
@ -61,3 +65,41 @@ function health( tmpfile, line, attr, check, u) {
if( nlinks != nlinksok ) exit(1)
}
if( arg == "--scrape" ) {
outdir = ARGV[3]
if ( outdir == "" ) outdir = "."
system("mkdir "outdir" || true ")
system("cp "tmpfile" "outdir"/index.html")
for (u in links) {
if( substr(u,1,1) == "/" ) u = rooturl""u
check = "curl -L --max-time 20 -I -s \"" u "\" > /dev/null"
if (system(check) == 0 && has_non_html_ext(u) ){
n = split(u, fileparts, "/")
filepart = fileparts[n]
outfile = outdir"/"filepart
system("curl -L --max-time 20 -s \"" u "\" > "outfile)
system("sed -i 's|"u"|"filepart"|g' "outdir"/index.html")
nlinksok++
printf("✅ %s\n", u)
}else printf("🔗 %s\n", u)
nlinks+=1
}
}
}
# Function: has_non_html_ext
# Returns 1 if `file` contains at least one slash and ends in a dot-extension
# that is not "html"; returns 0 otherwise (no slash, no extension, or "html").
# Note: comparison is case-sensitive, so "page.HTML" still counts as non-html
# (preserved from the original behavior).
#
# `ext` is an extra parameter used as a local variable (standard awk idiom)
# so it no longer leaks into the global namespace; callers pass only `file`.
# Uses POSIX match() + RSTART/substr() instead of gawk's three-argument
# match(), so the script also runs under mawk/nawk — the shebang invokes
# plain `awk`, which is not guaranteed to be gawk.
function has_non_html_ext(file, ext) {
  # A bare name with no slash cannot be a path/URL asset we want to fetch.
  if (file !~ /\//) return 0
  # Match a trailing dot-extension (no '.' or '/' allowed inside it).
  # POSIX match() sets RSTART to the position of the '.'.
  if (match(file, /\.[^.\/]+$/)) {
    ext = substr(file, RSTART + 1)  # drop the leading '.'
    if (ext != "html") return 1
  }
  return 0
}