#!/usr/bin/env -S awk -f
# Print CLI usage and abort with a non-zero exit status.
# Note: the per-asset timeout is a bare positional argument (the health
# branch reads it directly from ARGV[3]), so it is shown positionally here;
# the previous text wrongly advertised a --max-time-per-asset flag that the
# code never parses.
function usage() {
    print "Usage: ./janusxr health <room_url> [max_time_per_asset=5]\n" \
          "       ./janusxr scrape <room_url> <outdir>\n" \
          "       ./janusxr optimize <room_url>\n"
    exit 1
}
# ------------------------------
# Core dispatcher
# ------------------------------
BEGIN {
    if (ARGC < 2) usage()
    command = ARGV[1]
    if (command == "health" || command == "scrape" || command == "optimize") {
        scrape(command)
        # Stop here: without an explicit exit, awk falls through to its main
        # input loop and tries to open the remaining ARGV entries ("health",
        # the room URL, ...) as data files, which fails at runtime.
        exit 0
    } else {
        # (Removed leftover debug printf calls that used ARGV values as the
        # printf FORMAT string — a "%" in an argument would corrupt output.)
        printf("Unknown command: %s\n", command)
        usage()
    }
}
# Fetch a room page, extract every src="..."/url="..." attribute value, then
# run the action selected by `arg`:
#   health   - HEAD-check every link, print a health percentage, exit 1 on failures
#   scrape   - download non-HTML assets into <outdir> and rewrite index.html
#   optimize - emit <a>/<link rel='preconnect'> tags for copy/paste
# Every parameter after `arg` is an AWK-style local (callers pass only `arg`);
# the previous version leaked url/rooturl/links/counters as globals.
# NOTE(review): URLs are interpolated into shell command lines below; a URL
# containing quotes or shell metacharacters could inject commands. Only run
# this against trusted room URLs.
function scrape(arg,    url, rooturl, tmpfile, cmd, line, attr, check, u,
                        links, maxtime, nlinks, nlinksok, outdir, n, fileparts,
                        filepart, outfile, domain, urlpart, domains, pct) {
    url = ARGV[2]
    if (url == "") {
        print "❌ Missing URL argument."
        usage()
    }

    # Site root: the URL with its last path component stripped; used to
    # absolutize links that start with "/".
    rooturl = url
    sub(/\/[^\/]+$/, "", rooturl)
    tmpfile = "/tmp/out.html"

    # Fetch the HTML page with curl.
    cmd = "curl -s \"" url "\" -o \"" tmpfile "\""
    if (system(cmd) != 0) { print "❌ Failed to fetch " url; exit 1 }

    # Collect unique link targets from src="..." and url="..." attributes,
    # scanning each line repeatedly so multiple attributes per line are found.
    while ((getline line < tmpfile) > 0) {
        while (match(line, /(src|url)="[^"]+"/)) {
            attr = substr(line, RSTART, RLENGTH)
            sub(/^[^=]+="/, "", attr)   # strip leading  src=" / url="
            sub(/"$/, "", attr)         # strip trailing quote
            links[attr] = 1
            line = substr(line, RSTART + RLENGTH)
        }
    }
    close(tmpfile)

    if (arg == "health") {
        # Optional positional argument: per-asset curl timeout in seconds.
        maxtime = ARGV[3]
        if (maxtime == "") maxtime = 5
        nlinks = 0
        nlinksok = 0
        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L --max-time " maxtime " -I -s \"" u "\" > /dev/null"
            if (system(check) == 0) {
                nlinksok++
                printf("✅ %s\n", u)
            } else printf("❌ %s\n", u)
            nlinks += 1
        }
        # Guard against division by zero when the page yielded no links
        # (the old expression 100/nlinks aborted awk in that case).
        pct = (nlinks > 0) ? (100 / nlinks) * nlinksok : 0
        print "⚕️ health: " pct "%"
        if (nlinks != nlinksok) exit(1)
    }

    if (arg == "scrape") {
        outdir = ARGV[3]
        if (outdir == "") outdir = "."
        # -p: create intermediate directories and succeed when outdir exists
        # (the old `mkdir ... || true` masked every mkdir failure, not just EEXIST).
        system("mkdir -p \"" outdir "\"")
        system("cp \"" tmpfile "\" \"" outdir "/index.html\"")

        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L --max-time 20 -I -s \"" u "\" > /dev/null"
            if (system(check) == 0 && has_non_html_ext(u)) {
                # Save the asset under its basename, then point index.html at
                # the local copy.
                n = split(u, fileparts, "/")
                filepart = fileparts[n]
                outfile = outdir "/" filepart
                system("curl -L --max-time 20 -s \"" u "\" > \"" outfile "\"")
                # NOTE(review): the substitution breaks if the URL contains
                # "|", "&" or other sed-special characters — verify inputs.
                system("sed -i 's|" u "|" filepart "|g' \"" outdir "/index.html\"")
                nlinksok++
                printf("✅ %s\n", u)
            } else printf("🔗 %s\n", u)
            nlinks += 1
        }
    }

    if (arg == "optimize") {
        printf("<!-- copy/paste below into your HTML/JML-file -->\n")
        for (u in links) {
            if (substr(u, 1, 1) == "/") u = rooturl u
            check = "curl -L --max-time 20 -I -s \"" u "\" > /dev/null"
            if (system(check) == 0 && has_non_html_ext(u)) {
                printf("<a href='" u "'/>\n")
                # Emit one preconnect hint per distinct scheme://host pair.
                split(u, urlpart, "/")
                domain = urlpart[1] "//" urlpart[3]
                if (!domains[domain]) {
                    printf("<link rel='preconnect' href='" domain "'/>\n")
                    domains[domain] = 1
                }
            }
        }
    }
}
# Function: has_non_html_ext
# Returns 1 if `file` contains at least one slash and ends in a dot-extension
# that is not "html"; returns 0 otherwise.
# `ext` is an AWK-style local parameter (callers pass only `file`).
# Portability fix: the original used gawk's three-argument match(file, re, arr),
# a gawk extension that is a fatal error in mawk/nawk — but the shebang runs
# generic `awk`. This version uses POSIX RSTART/RLENGTH instead.
function has_non_html_ext(file, ext) {
    # Require at least one slash (i.e. a path/URL, not a bare name).
    if (file !~ /\//) return 0

    # Extension = text after the final dot, containing no dot or slash.
    if (match(file, /\.[^.\/]+$/)) {
        ext = substr(file, RSTART + 1, RLENGTH - 1)
        if (ext != "html") return 1
    }

    return 0
}