Every developer should be cautiously paranoid about code from the wild. For every code repository, I run a quick security check before running the code on a fresh virtual machine. I’ve assembled some of my checks into the script below, then used an AI coding assistant to make the output look decent.
#!/bin/sh
# repo_malware_scan.sh — POSIX-compatible Node repo heuristic scanner (with extra signatures)
# Usage: sh repo_malware_scan.sh /path/to/repo [--include-node-modules]
#   /path/to/repo            directory to scan (defaults to the current directory)
#   --include-node-modules   only recognized spelling of the optional second argument
# -e: stop on an unhandled command failure; -u: treat unset variables as errors.
set -eu

# require_directory PATH
# Returns 0 when PATH is an existing directory; otherwise prints a diagnostic
# on stderr and returns 1.
require_directory() {
  if [ -d "$1" ]; then
    return 0
  fi
  printf '%s: target is not a directory: %s\n' "${0##*/}" "$1" >&2
  return 1
}

REPO="${1:-.}"
INCLUDE_NODE_MODULES="${2:-}"

# Fail fast on a bad target instead of letting every later check silently
# scan nothing.
require_directory "$REPO" || exit 1

# A mistyped switch would otherwise be ignored without notice; keep running,
# but tell the user on stderr.
case "$INCLUDE_NODE_MODULES" in
  '' | --include-node-modules) ;;
  *)
    printf '%s: ignoring unrecognized option: %s\n' "${0##*/}" "$INCLUDE_NODE_MODULES" >&2
    ;;
esac
# Colors: start with empty strings so messages stay plain when stdout is
# piped or redirected, then switch to ANSI sequences only on a terminal.
RED="" ; YEL="" ; GRN="" ; CLR=""
if [ -t 1 ]; then
  RED=$(printf '\033[0;31m')
  YEL=$(printf '\033[0;33m')
  GRN=$(printf '\033[0;32m')
  CLR=$(printf '\033[0m')
fi
# _emit COLOR TAG MESSAGE
# Shared formatter: writes "<COLOR>[TAG]<CLR> MESSAGE" plus a newline to stdout.
_emit() { printf '%s[%s]%s %s\n' "$1" "$2" "$CLR" "$3"; }

# Severity helpers; all arguments are joined into a single message.
log()  { _emit "$GRN" INFO  "$*"; }
warn() { _emit "$YEL" WARN  "$*"; }
err()  { _emit "$RED" ALERT "$*"; }
# print_banner
# Prints the scan header: title, target path and whether node_modules is
# included, followed by a blank line.
# Reads: GRN, CLR, REPO, INCLUDE_NODE_MODULES.
# printf is used for the path because echo may interpret backslashes or a
# leading "-n" in user-supplied text, depending on the shell.
print_banner() {
  printf '%s== Repo quick malware/obfuscation heuristic scan ==%s\n' "$GRN" "$CLR"
  printf 'Target: %s\n' "$REPO"
  # Explicit if/else: with "A && B || C" the C branch would also run if B failed.
  if [ "$INCLUDE_NODE_MODULES" = "--include-node-modules" ]; then
    printf 'Include node_modules: true\n'
  else
    printf 'Include node_modules: false\n'
  fi
  printf '\n'
}
print_banner
# find_files
# Prints, one path per line, every regular file under $REPO whose name ends in
# one of: js, cjs, mjs, ts, tsx, jsx, html, htm, json, sh.
# Directories named node_modules below $REPO are skipped unless
# INCLUDE_NODE_MODULES is exactly "--include-node-modules".
# Reads: REPO, INCLUDE_NODE_MODULES.
# Returns: find's exit status.
find_files() {
  # Keep the extension filter in the function's own positional parameters so
  # it is written once and shared by both branches.
  set -- \
    -name "*.js" -o -name "*.cjs" -o -name "*.mjs" -o \
    -name "*.ts" -o -name "*.tsx" -o -name "*.jsx" -o \
    -name "*.html" -o -name "*.htm" -o -name "*.json" -o -name "*.sh"
  if [ "$INCLUDE_NODE_MODULES" = "--include-node-modules" ]; then
    find "$REPO" -type f \( "$@" \) -print
  else
    # -prune stops find from descending into node_modules at all, rather than
    # walking the whole dependency tree and discarding each result.
    find "$REPO" -path "*/node_modules" -prune -o -type f \( "$@" \) -print
  fi
}
##############################################################################
# 1) Pattern scanning (no PCRE needed; use -E). Edit or extend below freely. #
##############################################################################
echo "${GRN}1) Pattern-based matches (eval, new Function, exec, atob, etc.)${CLR}"
FOUND_ANY=0
# One extended regular expression per line; blank lines are ignored.
PATTERNS=$(cat <<'EOF'
eval\(
new[[:space:]]+Function\(
Function\(
setTimeout\([^,]+["']
setInterval\([^,]+["']
document\.write\(
window\[['"][A-Za-z0-9_]+['"]\]
atob\(
unescape\(
fromCharCode\(
toString\(\)\.split\(
constructor\]
require\(['"]child_process['"]\)
execFile\(|execSync\(|spawnSync\(
process\.env\[
Buffer\.from\(.{50,}\)
EOF
)

# scan_patterns
# For every pattern in PATTERNS, greps each file listed by find_files and
# prints the matching lines as " <file>:<line>:<text>" under a
# "-- Pattern: '<regex>'" header that appears once per matching pattern.
# Sets FOUND_ANY=1 when anything matched.
# Reads: PATTERNS, YEL, CLR. Calls: find_files.
scan_patterns() {
  # List the files once instead of re-running find for every pattern.
  # find errors (e.g. unreadable directories) must not abort the scan.
  pattern_files=$(find_files) || true
  # Both loops read from here-documents rather than pipelines: a piped
  # "while" runs in a subshell, where FOUND_ANY=1 would be lost.
  while IFS= read -r p; do
    [ -n "$p" ] || continue
    MATCHED=0
    while IFS= read -r f; do
      [ -n "$f" ] || continue
      # The extra /dev/null operand makes grep prefix each hit with the file
      # name, so the path never has to be spliced into a sed expression
      # (which breaks on names containing '#', '&' or '\').
      pattern_hits=$(grep -n -E -e "$p" -- /dev/null "$f" 2>/dev/null) || continue
      if [ "$MATCHED" -eq 0 ]; then
        printf "%s-- Pattern: '%s'%s\n" "$YEL" "$p" "$CLR"
        MATCHED=1
      fi
      printf '%s\n' "$pattern_hits" | sed 's/^/ /'
      FOUND_ANY=1
    done <<PATTERN_FILES_EOF
$pattern_files
PATTERN_FILES_EOF
  done <<PATTERN_LIST_EOF
$PATTERNS
PATTERN_LIST_EOF
}
scan_patterns
if [ "$FOUND_ANY" -eq 0 ]; then
  echo " No obvious pattern matches found."
fi
echo
#########################################################
# 2) Minified / single-line style (very long line > 600)#
#########################################################
echo "${GRN}2) Minified / single-line files (very long lines)${CLR}"
MINIFIED_FOUND=0

# is_probably_text PATH
# Returns 1 only when file(1) reports the content encoding as "binary".
# The encoding is checked instead of grepping the MIME line for "text"
# because that line includes the path (a directory called "context" would
# match) and the MIME type is not always text/* (JSON is commonly reported
# as application/json). When file(1) is missing, every candidate is treated
# as text so the check still runs.
is_probably_text() {
  command -v file >/dev/null 2>&1 || return 0
  case $(file -b --mime-encoding "$1" 2>/dev/null) in
    binary) return 1 ;;
  esac
  return 0
}

# scan_minified
# Reports each file from find_files whose first over-long line exceeds 600
# characters, as " <file>:<line>:<length>" plus an explanatory line.
# Sets MINIFIED_FOUND=1 when at least one file is reported.
# Calls: find_files, is_probably_text.
scan_minified() {
  minified_files=$(find_files) || true
  # Here-document input keeps the loop in the current shell so the
  # MINIFIED_FOUND assignment survives (a piped loop runs in a subshell).
  while IFS= read -r f; do
    [ -n "$f" ] || continue
    is_probably_text "$f" || continue
    # Single awk pass: print "<line>:<length>" for the first long line, then stop.
    long_line=$(awk 'length($0) > 600 { print NR ":" length($0); exit 0 }' < "$f" 2>/dev/null) || true
    [ -n "$long_line" ] || continue
    printf ' %s:%s\n' "$f" "$long_line"
    printf ' -> %s seems minified or obfuscated (very long lines).\n' "$f"
    MINIFIED_FOUND=1
  done <<MINIFIED_FILES_EOF
$minified_files
MINIFIED_FILES_EOF
}
scan_minified
if [ "$MINIFIED_FOUND" -eq 0 ]; then
  echo " No minified-style single-line files detected."
fi
echo
################################
# 3) Long base64-like strings #
################################
echo "${GRN}3) Long base64-like continuous blobs (possible embedded payloads)${CLR}"
B64_FOUND=0
# 120 or more consecutive base64-alphabet characters, optionally '=' padded.
B64_REGEX='[A-Za-z0-9+/]{120,}={0,2}'

# scan_base64
# Prints, for every file from find_files containing a B64_REGEX match, a
# header line followed by the matching lines as " <line>:<text>".
# Sets B64_FOUND=1 when at least one file matched.
# Reads: B64_REGEX, YEL, CLR. Calls: find_files.
scan_base64() {
  b64_files=$(find_files) || true
  # The loop is fed by a here-document, not a pipe, so that B64_FOUND=1 is
  # assigned in the current shell instead of a discarded subshell.
  while IFS= read -r f; do
    [ -n "$f" ] || continue
    # One grep per file: its output is kept for printing and its exit
    # status decides whether the file is flagged.
    b64_hits=$(grep -n -E -e "$B64_REGEX" -- "$f" 2>/dev/null) || continue
    printf "%s-- Potential base64 blob in %s%s\n" "$YEL" "$f" "$CLR"
    printf '%s\n' "$b64_hits" | sed 's/^/ /'
    B64_FOUND=1
  done <<B64_FILES_EOF
$b64_files
B64_FILES_EOF
}
scan_base64
if [ "$B64_FOUND" -eq 0 ]; then
  echo " No very long base64-like blobs found."
fi
echo
########################################
# 4) High-entropy lines (obfuscation) #
########################################
echo "${GRN}4) High-entropy lines detection (possible obfuscation)${CLR}"
ENTROPY_FOUND=0
# Entropy threshold *100 (e.g., 450 = 4.50 bits/char)
ENTROPY_THRESHOLD=450

# scan_entropy
# For every file from find_files of at least 1024 bytes, computes the
# per-character Shannon entropy of each line of 40+ characters and prints
# " <file>:<line>:entropy=<bits*100>:length=<chars>" for lines at or above
# ENTROPY_THRESHOLD. Sets ENTROPY_FOUND=1 when any line is printed.
# Reads: ENTROPY_THRESHOLD. Calls: find_files.
scan_entropy() {
  entropy_files=$(find_files) || true
  # Here-document input (not a pipe) keeps ENTROPY_FOUND in the current shell.
  while IFS= read -r f; do
    [ -n "$f" ] || continue
    # Skip tiny files (<1KB) for speed. wc -c is used because it is available
    # everywhere; tr strips the padding some wc implementations add.
    SIZE=$({ wc -c < "$f"; } 2>/dev/null | tr -d '[:space:]')
    [ "${SIZE:-0}" -ge 1024 ] || continue
    # One awk pass both prints the flagged lines and reports through its exit
    # status (0 = at least one line flagged) whether anything was found.
    if awk -v threshold="$ENTROPY_THRESHOLD" '
      {
        n = length($0)
        if (n < 40) next
        split("", count)   # portable way to empty an array
        for (i = 1; i <= n; i++) count[substr($0, i, 1)]++
        bits = 0
        for (ch in count) { p = count[ch] / n; bits += -p * (log(p) / log(2)) }
        score = int(bits * 100)
        if (score >= threshold) {
          printf(" %s:%d:entropy=%d:length=%d\n", FILENAME, NR, score, n)
          flagged = 1
        }
      }
      END { exit(flagged ? 0 : 1) }
    ' "$f"; then
      ENTROPY_FOUND=1
    fi
  done <<ENTROPY_FILES_EOF
$entropy_files
ENTROPY_FILES_EOF
}
scan_entropy
if [ "$ENTROPY_FOUND" -eq 0 ]; then
  echo " No high-entropy lines detected above threshold ${ENTROPY_THRESHOLD}."
else
  echo " (High entropy lines are heuristics for obfuscation; manually inspect flagged files above.)"
fi
echo
########################################################
# 5) package.json suspicious lifecycle & dependencies #
########################################################
echo "${GRN}5) package.json checks (suspicious scripts / odd deps)${CLR}"
PKG="$REPO/package.json"
if [ ! -f "$PKG" ]; then
  echo " No package.json found at repo root."
else
  # Rough view of the "scripts" object: the line that mentions it within the
  # first 200 lines, plus the 20 lines that follow.
  echo " Showing potential lifecycle scripts (rough grep):"
  sed -n '1,200p' "$PKG" | grep -n -A 20 '"scripts"' | sed 's/^/ /' || true
  # Warn about each install-time hook name that appears as a quoted string.
  for hook in postinstall install preinstall prepare; do
    if grep -q "\"$hook\"" "$PKG"; then
      warn " package.json defines '$hook' script. Review what runs during install."
    fi
  done
  # Only when a dependency section exists: look for quoted names that are
  # exactly one of the suspicious keywords.
  if grep -n -E -i '"(dependencies|devDependencies)"' "$PKG" >/dev/null 2>&1 &&
    grep -E -i '"(cryptominer|mine|malware|backdoor|stealer|rat|trojan)"' "$PKG" >/dev/null 2>&1; then
    warn " Dependency names include suspicious keywords; inspect package.json."
  fi
fi
echo
########################################################
# 6) Summarize results (pre-extra-signatures check) #
########################################################
echo "${GRN}== Preliminary results ==${CLR}"
# Every flag holds 0 or 1, so a "1" anywhere in the concatenation means at
# least one of the earlier heuristics fired.
case "${FOUND_ANY}${MINIFIED_FOUND}${B64_FOUND}${ENTROPY_FOUND}" in
  *1*)
    err "Potential issues found in earlier heuristics. Extra signature checks will run next."
    ;;
  *)
    log "No obvious issues detected by heuristics so far. Proceeding with extra signature checks."
    ;;
esac
echo
########################################################
# 7) Extra signature heuristics (constructor, IIFE etc)#
########################################################
echo "${GRN}7) Extra signature heuristics (constructor trick, suspicious globals/IIFE, obfuscator shufflers, new Function, long gibberish)${CLR}"
EXTRA_FOUND=0
# Scan the same file set as the earlier sections, so --include-node-modules
# is honored here too and .git or binary artifacts are not grepped.
EXTRA_FILES=$(find_files) || true

# scan_signature LABEL REGEX [MAX_LINES]
# Greps every file listed in EXTRA_FILES (one path per line) for the extended
# regex REGEX, case-insensitively, and prints each hit as
# " [LABEL] <file>:<line>:<text>". When MAX_LINES is given and greater than
# zero, only the first MAX_LINES hits are printed.
# Sets EXTRA_FOUND=1 only when grep produced at least one hit. Testing the
# status of a "grep | sed" pipeline is not enough: that status is sed's,
# which succeeds whether or not grep matched.
scan_signature() {
  sig_label=$1
  sig_regex=$2
  sig_limit=${3:-0}
  # The pipeline below only produces text; the flag is set afterwards in the
  # current shell, based on whether any text was produced.
  sig_hits=$(
    printf '%s\n' "$EXTRA_FILES" |
      while IFS= read -r sig_file; do
        [ -n "$sig_file" ] || continue
        # /dev/null as an extra operand forces the "<file>:" prefix portably.
        grep -n -i -E -e "$sig_regex" -- /dev/null "$sig_file" 2>/dev/null || true
      done |
      if [ "$sig_limit" -gt 0 ]; then head -n "$sig_limit"; else cat; fi
  )
  [ -n "$sig_hits" ] || return 0
  EXTRA_FOUND=1
  printf '%s\n' "$sig_hits" | awk -v tag="$sig_label" '{ print " [" tag "] " $0 }'
}

# scan_router_globals
# Reports files in EXTRA_FILES that contain "module.exports = router"
# (case-insensitive, flexible spacing) and also a "global[" access, printing
# the file name and the matching "global[" lines. Sets EXTRA_FOUND=1 on a hit.
scan_router_globals() {
  # Here-document input keeps the loop, and its EXTRA_FOUND assignment, in the
  # current shell.
  while IFS= read -r rf; do
    [ -n "$rf" ] || continue
    grep -q -i -e "module.exports *= *router" -- "$rf" 2>/dev/null || continue
    router_hits=$(grep -n -e "global\[" -- /dev/null "$rf" 2>/dev/null) || continue
    printf " [GLOBAL_AFTER_EXPORT] %s\n" "$rf"
    printf '%s\n' "$router_hits" | sed 's/^/ /'
    EXTRA_FOUND=1
  done <<ROUTER_FILES_EOF
$EXTRA_FILES
ROUTER_FILES_EOF
}

# The regexes use [[:space:]] because \s and \b are not part of POSIX ERE.
# 7.a constructor property access patterns (e.g. obj['constructor'] or ['constructor'])
scan_signature CONSTRUCTOR '\[[[:space:]]*["'"'"']constructor["'"'"'][[:space:]]*\]'
# 7.b files that export router and also contain global[...] (suspicious immediate-exec after export)
scan_router_globals
# 7.c Obfuscated shuffler pattern: function with charAt and modular math and return m.join
# (the word boundaries around charAt were dropped with \b; the rest of the regex keeps it conservative)
scan_signature SHUFFLER 'var[[:space:]]+[^=]+=[[:space:]]*function[[:space:]]*\([^)]*\)[[:space:]]*\{[^}]{0,300}charAt.*%[0-9]+\).*return m\.join'
# 7.d new Function / Function constructor usage (explicit):
# "new Function(" or "Function(" followed by a quote
scan_signature FUNC_CONSTR 'new[[:space:]]+Function\(|Function\([[:space:]]*["'"'"']'
# 7.e Very long gibberish sequences (single token > 200 chars) — possible encoded blobs.
# Only the first 20 matches are shown to keep output bounded.
scan_signature GIBBERISH '[A-Za-z0-9_]{200,}' 20
# If any of the extra checks flagged, reflect in main state
if [ "$EXTRA_FOUND" -eq 1 ]; then
  warn "One or more extra signature heuristics matched. Inspect files above."
  FOUND_ANY=1
else
  echo " No extra signature heuristics matched."
fi
echo
########################
# 8) Final summarize #
########################
echo "${GRN}== Final scan summary ==${CLR}"
# Exit status 2 when any heuristic flag is set, 0 otherwise.
SUMMARY_STATUS=0
for flag in "$FOUND_ANY" "$MINIFIED_FOUND" "$B64_FOUND" "$ENTROPY_FOUND" "$EXTRA_FOUND"; do
  if [ "$flag" -eq 1 ]; then
    SUMMARY_STATUS=2
  fi
done
if [ "$SUMMARY_STATUS" -eq 2 ]; then
  err "Potential issues found. Review flagged files manually in a sandboxed environment and rotate secrets if this repo is deployed."
else
  log "No obvious issues detected by heuristics. This is not confirmation of safety."
fi
exit "$SUMMARY_STATUS"