Every developer should be cautiously paranoid about code from the wild. Before running any code repository, even on a fresh virtual machine, I do a quick security check. I’ve assembled some of my various checks into a single script, then used an AI coding assistant to make its output look decent.

#!/bin/sh
# repo_malware_scan.sh — POSIX-compatible Node repo heuristic scanner (with extra signatures)
# Usage: sh repo_malware_scan.sh /path/to/repo [--include-node-modules]

set -eu

# parse_args [ARG...]
#   Sets the two globals the rest of the script reads:
#     REPO                  - first non-flag argument (default ".")
#     INCLUDE_NODE_MODULES  - "--include-node-modules" if that flag appears
#                             anywhere on the command line, otherwise "".
#   Additional non-flag arguments are ignored.
#   Exits 1 (distinct from the scanner's "findings" status 2) when the target
#   does not exist: otherwise every find/grep below fails quietly and the scan
#   would report a clean result for a path that was never examined.
parse_args() {
  REPO="."
  INCLUDE_NODE_MODULES=""
  _repo_seen=0
  for _arg do
    case "$_arg" in
      --include-node-modules)
        INCLUDE_NODE_MODULES="--include-node-modules"
        ;;
      *)
        if [ "$_repo_seen" -eq 0 ]; then
          # An explicitly empty argument falls back to the current directory.
          REPO="${_arg:-.}"
          _repo_seen=1
        fi
        ;;
    esac
  done

  if [ ! -e "$REPO" ]; then
    printf 'repo_malware_scan: target not found: %s\n' "$REPO" >&2
    exit 1
  fi
}

# ${1+"$@"} expands to nothing when there are no arguments, which keeps very
# old shells from tripping over "$@" under `set -u`.
parse_args ${1+"$@"}

# ANSI colour codes; left empty unless stdout is a terminal so that piped or
# redirected output stays free of escape sequences.
RED=""
YEL=""
GRN=""
CLR=""
if [ -t 1 ]; then
  RED=$(printf '\033[0;31m')
  YEL=$(printf '\033[0;33m')
  GRN=$(printf '\033[0;32m')
  CLR=$(printf '\033[0m')
fi

# _emit COLOR TAG MESSAGE — print "[TAG] MESSAGE" with the tag coloured.
_emit() {
  printf "%s[%s]%s %s\n" "$1" "$2" "$CLR" "$3"
}

# Severity helpers; all arguments are joined into one message on stdout.
log() {
  _emit "$GRN" "INFO" "$*"
}
warn() {
  _emit "$YEL" "WARN" "$*"
}
err() {
  _emit "$RED" "ALERT" "$*"
}

# Banner: announce the scan target and whether node_modules is included.
# printf is used instead of echo because echo's treatment of backslashes in
# "$REPO" differs between /bin/sh implementations.
printf '%s== Repo quick malware/obfuscation heuristic scan ==%s\n' "$GRN" "$CLR"
printf 'Target: %s\n' "$REPO"
if [ "$INCLUDE_NODE_MODULES" = "--include-node-modules" ]; then
  printf 'Include node_modules: true\n'
else
  printf 'Include node_modules: false\n'
fi
printf '\n'

# find_files — list candidate source files under $REPO, one path per line.
# Works with both BSD and GNU find.
# Extensions: js,cjs,mjs,ts,tsx,jsx,html,htm,json,sh
# Globals read: REPO, INCLUDE_NODE_MODULES (paths containing /node_modules/
# are filtered out unless the latter equals "--include-node-modules").
find_files() {
  # Assemble the leading find arguments in the function's own positional
  # parameters so the extension list only has to be written once.
  set -- "$REPO" -type f
  if [ "$INCLUDE_NODE_MODULES" != "--include-node-modules" ]; then
    set -- "$@" ! -path "*/node_modules/*"
  fi

  find "$@" \( \
    -name "*.js" -o -name "*.cjs" -o -name "*.mjs" -o \
    -name "*.ts" -o -name "*.tsx" -o -name "*.jsx" -o \
    -name "*.html" -o -name "*.htm" -o -name "*.json" -o -name "*.sh" \
  \)
}

##############################################################################
# 1) Pattern scanning (no PCRE needed; use -E). Edit or extend below freely. #
##############################################################################
echo "${GRN}1) Pattern-based matches (eval, new Function, exec, atob, etc.)${CLR}"
FOUND_ANY=0
# Extended-regex signatures, one per line.
PATTERNS=$(cat <<'EOF'
eval\(
new[[:space:]]+Function\(
Function\(
setTimeout\([^,]+["']
setInterval\([^,]+["']
document\.write\(
window\[['"][A-Za-z0-9_]+['"]\]
atob\(
unescape\(
fromCharCode\(
toString\(\)\.split\(
constructor\]
require\(['"]child_process['"]\)
execFile\(|execSync\(|spawnSync\(
process\.env\[
Buffer\.from\(.{50,}\)
EOF
)

# The file list does not depend on the pattern, so walk the tree only once.
# `|| true`: a partially unreadable tree must not abort the scan under set -e.
PATTERN_FILES=$(find_files) || true

# The pattern loop is fed by a here-document rather than a pipe so that it runs
# in the current shell and the FOUND_ANY assignment survives the loop.
while IFS= read -r p; do
  [ -n "$p" ] || continue
  # The per-file loop runs in an explicit subshell on the right of a pipe; it
  # reports "something matched" through its exit status (0 = at least one hit),
  # which is the only thing that can cross the subshell boundary.
  if printf '%s\n' "$PATTERN_FILES" | (
    status=1
    while IFS= read -r f; do
      [ -n "$f" ] || continue
      grep -q -E -e "$p" -- "$f" 2>/dev/null || continue
      if [ "$status" -ne 0 ]; then
        printf "%s-- Pattern: '%s'%s\n" "$YEL" "$p" "$CLR"
        status=0
      fi
      # Passing /dev/null as an extra operand makes grep print the file name
      # itself, so the path never has to be spliced into a sed script.
      grep -n -E -e "$p" -- /dev/null "$f" 2>/dev/null | sed 's/^/    /'
    done
    exit "$status"
  ); then
    FOUND_ANY=1
  fi
done <<EOF
$PATTERNS
EOF
if [ "$FOUND_ANY" -eq 0 ]; then
  echo "  No obvious pattern matches found."
fi
echo

#########################################################
# 2) Minified / single-line style (very long line > 600)#
#########################################################
echo "${GRN}2) Minified / single-line files (very long lines)${CLR}"
MINIFIED_FOUND=0
# The loop sits on the right-hand side of a pipe and therefore runs in a
# subshell; its exit status (0 = something flagged) carries the result back.
if find_files | (
  status=1
  have_file=no
  if command -v file >/dev/null 2>&1; then
    have_file=yes
  fi
  while IFS= read -r f; do
    # Skip only what file(1) identifies as binary. Requiring the word "text"
    # would drop sources reported as application/json or
    # application/javascript; -b keeps the path out of the matched output.
    # Without file(1) every candidate is checked.
    if [ "$have_file" = yes ] &&
       file -b --mime "$f" 2>/dev/null | grep -q 'charset=binary'; then
      continue
    fi
    # First line longer than 600 characters, reported as path:line:length.
    long_line=$(awk 'length($0) > 600 {print FILENAME ":" NR ":" length($0); exit 0}' "$f" 2>/dev/null) || continue
    [ -n "$long_line" ] || continue
    printf '    %s\n' "$long_line"
    printf '    -> %s seems minified or obfuscated (very long lines).\n' "$f"
    status=0
  done
  exit "$status"
); then
  MINIFIED_FOUND=1
fi
if [ "$MINIFIED_FOUND" -eq 0 ]; then
  echo "  No minified-style single-line files detected."
fi
echo

################################
# 3) Long base64-like strings  #
################################
echo "${GRN}3) Long base64-like continuous blobs (possible embedded payloads)${CLR}"
B64_FOUND=0
# 120+ characters from the base64 alphabet, optionally followed by padding.
B64_REGEX='[A-Za-z0-9+/]{120,}={0,2}'
# The per-file loop runs in a subshell (right-hand side of a pipe), so the
# result is handed back through its exit status: 0 means a blob was reported.
if find_files | (
  status=1
  while IFS= read -r f; do
    grep -q -E -e "$B64_REGEX" -- "$f" 2>/dev/null || continue
    printf "%s-- Potential base64 blob in %s%s\n" "$YEL" "$f" "$CLR"
    grep -n -E -e "$B64_REGEX" -- "$f" 2>/dev/null | sed 's/^/    /'
    status=0
  done
  exit "$status"
); then
  B64_FOUND=1
fi
if [ "$B64_FOUND" -eq 0 ]; then
  echo "  No very long base64-like blobs found."
fi
echo

########################################
# 4) High-entropy lines (obfuscation)  #
########################################
echo "${GRN}4) High-entropy lines detection (possible obfuscation)${CLR}"
ENTROPY_FOUND=0
# Entropy threshold *100 (e.g., 450 = 4.50 bits/char)
ENTROPY_THRESHOLD=450
# The loop runs in a subshell (right-hand side of a pipe); its exit status
# (0 = at least one line reported) is how the result reaches this shell.
if find_files | (
  status=1
  while IFS= read -r f; do
    [ -r "$f" ] || continue
    # Skip tiny files (<1KB) for speed. wc -c is POSIX, unlike the GNU/BSD
    # stat variants; some implementations pad the count, hence the tr.
    size=$(wc -c < "$f" | tr -d '[:space:]')
    [ "${size:-0}" -ge 1024 ] || continue

    # One pass per file: print every line of 40+ characters whose Shannon
    # entropy (bits per character, times 100) reaches the threshold, and exit
    # 0 only if something was printed.
    if awk -v threshold="$ENTROPY_THRESHOLD" '
      {
        n = length($0)
        if (n < 40) next
        split("", c)            # portable way to empty the count array
        e = 0
        for (i = 1; i <= n; i++) c[substr($0, i, 1)]++
        for (k in c) { p = c[k] / n; e -= p * (log(p) / log(2)) }
        E = int(e * 100)
        if (E >= threshold) {
          printf("    %s:%d:entropy=%d:length=%d\n", FILENAME, NR, E, n)
          found = 1
        }
      }
      END { exit(found ? 0 : 1) }
    ' "$f"; then
      status=0
    fi
  done
  exit "$status"
); then
  ENTROPY_FOUND=1
fi
if [ "$ENTROPY_FOUND" -eq 0 ]; then
  echo "  No high-entropy lines detected above threshold ${ENTROPY_THRESHOLD}."
else
  echo "  (High entropy lines are heuristics for obfuscation; manually inspect flagged files above.)"
fi
echo

########################################################
# 5) package.json suspicious lifecycle & dependencies  #
########################################################
echo "${GRN}5) package.json checks (suspicious scripts / odd deps)${CLR}"
PKG="$REPO/package.json"
if [ ! -f "$PKG" ]; then
  echo "  No package.json found at repo root."
else
  # Rough view of the "scripts" object: the matching line plus the 20 lines
  # after it, looking only at the first 200 lines of the manifest.
  echo "  Showing potential lifecycle scripts (rough grep):"
  head -n 200 "$PKG" | grep -n -A 20 '"scripts"' | sed 's/^/    /' || true

  # Script names that npm runs automatically around installation.
  for hook in postinstall install preinstall prepare; do
    grep -q "\"$hook\"" "$PKG" || continue
    warn "  package.json defines '$hook' script. Review what runs during install."
  done

  # Only look for alarming names when the manifest declares dependencies.
  if grep -E -i -n '"(dependencies|devDependencies)"' "$PKG" >/dev/null 2>&1 &&
     grep -E -i '"(cryptominer|mine|malware|backdoor|stealer|rat|trojan)"' "$PKG" >/dev/null 2>&1; then
    warn "  Dependency names include suspicious keywords; inspect package.json."
  fi
fi
echo

########################################################
# 6) Summarize results (pre-extra-signatures check)   #
########################################################
echo "${GRN}== Preliminary results ==${CLR}"
# Clean only when none of the four heuristic flags from sections 1-4 is set.
if [ "$FOUND_ANY" -ne 1 ] && [ "$MINIFIED_FOUND" -ne 1 ] \
   && [ "$B64_FOUND" -ne 1 ] && [ "$ENTROPY_FOUND" -ne 1 ]; then
  log "No obvious issues detected by heuristics so far. Proceeding with extra signature checks."
else
  err "Potential issues found in earlier heuristics. Extra signature checks will run next."
fi
echo

########################################################
# 7) Extra signature heuristics (constructor, IIFE etc)#
########################################################
echo "${GRN}7) Extra signature heuristics (constructor trick, suspicious globals/IIFE, obfuscator shufflers, new Function, long gibberish)${CLR}"
EXTRA_FOUND=0

# _tag_hits TAG [MAX]
#   Copies stdin to stdout with each line prefixed by "    [TAG] ", stopping
#   after MAX lines when MAX is given and greater than zero.
#   Exit status is 0 only if at least one line was read. Because this filter is
#   the last stage of each pipeline below, that status tells the caller whether
#   grep produced any match (a plain sed prefixer always exits 0, which would
#   mark every check as a hit).
_tag_hits() {
  awk -v tag="$1" -v max="${2:-0}" '
    max > 0 && NR > max { exit }
    { printf("    [%s] %s\n", tag, $0); seen = 1 }
    END { exit(seen ? 0 : 1) }
  '
}

# The regexes use [[:space:]] rather than the GNU-only \s so that they behave
# the same under any POSIX ERE implementation.

# 7.a constructor property access patterns (e.g. obj['constructor'] or ['constructor'])
# quote carefully to allow single/double quote variants
if grep -RniE -e '\[[[:space:]]*["'"'"']constructor["'"'"'][[:space:]]*\]' -- "$REPO" 2>/dev/null \
  | _tag_hits CONSTRUCTOR; then
  EXTRA_FOUND=1
fi

# 7.b files that export router and also contain global[...] (suspicious immediate-exec after export)
# Find files that mention "module.exports = router" then check them for "global["
FOUND_ROUTER_FILES=$(grep -Rli -e "module.exports *= *router" -- "$REPO" 2>/dev/null || true)
if [ -n "$FOUND_ROUTER_FILES" ]; then
  # Read the list from a here-document, not a pipe, so the loop runs in the
  # current shell and EXTRA_FOUND keeps its value afterwards.
  while IFS= read -r rf; do
    [ -n "$rf" ] || continue
    # /dev/null as an extra operand makes grep print the file name without
    # relying on the non-POSIX -H option.
    rf_hits=$(grep -n -e 'global\[' -- /dev/null "$rf" 2>/dev/null) || continue
    printf "    [GLOBAL_AFTER_EXPORT] %s\n" "$rf"
    printf '%s\n' "$rf_hits" | sed 's/^/        /'
    EXTRA_FOUND=1
  done <<EOF
$FOUND_ROUTER_FILES
EOF
fi

# 7.c Obfuscated shuffler pattern: function with charAt and modular math and return m.join
# This uses a conservative regex to avoid massive false positives
if grep -RniE -e 'var[[:space:]]+[^=]+=[[:space:]]*function[[:space:]]*\([^)]*\)[[:space:]]*\{[^}]{0,300}charAt.*%[0-9]+\).*return m\.join' -- "$REPO" 2>/dev/null \
  | _tag_hits SHUFFLER; then
  EXTRA_FOUND=1
fi

# 7.d explicit use of the Function constructor, with or without the "new"
# keyword; the bare form only counts when its first argument is a string literal
if grep -RniE -e 'new[[:space:]]+Function\(|Function\([[:space:]]*["'"'"']' -- "$REPO" 2>/dev/null \
  | _tag_hits FUNC_CONSTR; then
  EXTRA_FOUND=1
fi

# 7.e Very long gibberish sequences (single token > 200 chars) — possible encoded blobs
# Only show first 20 matches to keep output bounded
if grep -RniE -e '[A-Za-z0-9_]{200,}' -- "$REPO" 2>/dev/null \
  | _tag_hits GIBBERISH 20; then
  EXTRA_FOUND=1
fi

# If any of the extra checks flagged, reflect in main state
if [ "$EXTRA_FOUND" -eq 1 ]; then
  warn "One or more extra signature heuristics matched. Inspect files above."
  FOUND_ANY=1
else
  echo "  No extra signature heuristics matched."
fi
echo

########################
# 8) Final summarize   #
########################
echo "${GRN}== Final scan summary ==${CLR}"
# Exit status: 2 as soon as any heuristic flag equals 1, otherwise 0.
verdict=0
for flag in "$FOUND_ANY" "$MINIFIED_FOUND" "$B64_FOUND" "$ENTROPY_FOUND" "$EXTRA_FOUND"; do
  if [ "$flag" -eq 1 ]; then
    verdict=2
    break
  fi
done

if [ "$verdict" -eq 0 ]; then
  log "No obvious issues detected by heuristics. This is not confirmation of safety."
else
  err "Potential issues found. Review flagged files manually in a sandboxed environment and rotate secrets if this repo is deployed."
fi
exit "$verdict"