> scrape a room URL and rewrite JML to serve local assets

```bash
$ ./janusxr --scrape https://www.janusxr.org/newlobby/index.html mydir
🔗 http://dizzket.com/archive/dotmatrix/
🔗 https://vesta.janusvr.com/nazrin/minecraft-sandbox
✅ http://www.janusvr.com/newlobby/scripts/home.txt
...

$ ls -la mydir
index.html
home.txt
...
```
This commit is contained in:
Leon van Kammen 2025-11-11 23:21:21 +01:00
parent fde184608f
commit 832ab6e2f4
2 changed files with 81 additions and 19 deletions

View file

@ -4,12 +4,17 @@ Portable swiss-army knife to automate [janusXR](https://janusxr.org) / JML thing
> *What is [janusXR](https://janusxr.org)?* It's an open, user-operated immersive web layer, open-sourced by the JanusVR company, that seamlessly lives within websites, even 12 years later thanks to [janusweb](https://github.com/jbaicoianu/janusweb). It provides a highly viable and easy-to-adopt ecosystem of portals, enabling immersive experiences that challenge the traditional app store paradigm. Get started with [this guide](https://madjin.github.io/janus-guide/#/).
# Usage
```
Usage: ./janusxr --health <room_url> [--max-time-per-asset 5]
./janusxr --scrape <room_url> <outdir>
```
## Examples
> scan a room URL for broken links in JML+HTML
```
```bash
$ ./janusxr --health http://localhost:8790/models/m5gr26w0wqqs
✅ http://localhost:8791/templates/xrfragment/%232/website.glb
@ -28,7 +33,22 @@ $ ./janusxr --health http://localhost:8790/models/m5gr26w0wqqs
```
# Awk?
> scrape a room URL and rewrite JML to serve local assets
```bash
$ ./janusxr --scrape https://www.janusxr.org/newlobby/index.html mydir
🔗 http://dizzket.com/archive/dotmatrix/
🔗 https://vesta.janusvr.com/nazrin/minecraft-sandbox
✅ http://www.janusvr.com/newlobby/scripts/home.txt
...
$ ls -la mydir
index.html
home.txt
...
```
## Awk?
Why not some superfancy scripting for this task?

52
janusxr
View file

@ -1,7 +1,8 @@
#!/usr/bin/env -S awk -f
function usage() {
print "Usage: ./janusxr --health <room_url> \n"
print "Usage: ./janusxr --health <room_url> [--max-time-per-asset 5] \n" \
" ./janusxr --scrape <room_url> <outdir>\n"
exit 1
}
@ -11,15 +12,15 @@ function usage() {
BEGIN {
if (ARGC < 2) usage()
command = ARGV[1]
if (command == "--health") {
health()
if (command == "--health" || command == "--scrape") {
scrape(command)
} else {
printf("Unknown command: %s\n", command)
usage()
}
}
function health( tmpfile, line, attr, check, u) {
function scrape( arg, tmpfile, line, attr, check, u) {
url = ARGV[2]
if (url == "") {
print "❌ Missing URL argument."
@ -46,11 +47,14 @@ function health( tmpfile, line, attr, check, u) {
close(tmpfile)
# Check each extracted links
if( arg == "--health" ){
maxtime = ARGV[3]
if ( maxtime == "" ) maxtime = 5
nlinks = 0
nlinksok = 0
for (u in links) {
if( substr(u,1,1) == "/" ) u = rooturl""u
check = "curl -I -s \"" u "\" > /dev/null"
check = "curl -L --max-time "maxtime" -I -s \"" u "\" > /dev/null"
if (system(check) == 0){
nlinksok++
printf("✅ %s\n", u)
@ -61,3 +65,41 @@ function health( tmpfile, line, attr, check, u) {
if( nlinks != nlinksok ) exit(1)
}
if( arg == "--scrape" ) {
outdir = ARGV[3]
if ( outdir == "" ) outdir = "."
system("mkdir "outdir" || true ")
system("cp "tmpfile" "outdir"/index.html")
for (u in links) {
if( substr(u,1,1) == "/" ) u = rooturl""u
check = "curl -L --max-time 20 -I -s \"" u "\" > /dev/null"
if (system(check) == 0 && has_non_html_ext(u) ){
n = split(u, fileparts, "/")
filepart = fileparts[n]
outfile = outdir"/"filepart
system("curl -L --max-time 20 -s \"" u "\" > "outfile)
system("sed -i 's|"u"|"filepart"|g' "outdir"/index.html")
nlinksok++
printf("✅ %s\n", u)
}else printf("🔗 %s\n", u)
nlinks+=1
}
}
}
# Function: has_non_html_ext
# Returns 1 if `file` contains at least one slash and ends in a dot-extension
# that is not "html"; returns 0 otherwise (no slash, no extension, or "html").
# Note: comparison is case-sensitive, so "page.HTML" still counts as non-html
# (preserved from the original behavior).
#
# `ext` is an extra parameter used as a local variable (standard awk idiom)
# so it no longer leaks into the global namespace; callers pass only `file`.
# Uses POSIX match() + RSTART/substr() instead of gawk's three-argument
# match(), so the script also runs under mawk/nawk — the shebang invokes
# plain `awk`, which is not guaranteed to be gawk.
function has_non_html_ext(file, ext) {
  # A bare name with no slash cannot be a path/URL asset we want to fetch.
  if (file !~ /\//) return 0
  # Match a trailing dot-extension (no '.' or '/' allowed inside it).
  # POSIX match() sets RSTART to the position of the '.'.
  if (match(file, /\.[^.\/]+$/)) {
    ext = substr(file, RSTART + 1)  # drop the leading '.'
    if (ext != "html") return 1
  }
  return 0
}