[rulkc] [RESEND PATCH v1 1/4] scripts: Add hunspell checker
Dmitry Rokosov
rockosov at rulkc.org
Tue Jun 30 20:33:51 MSK 2026
From: Nikita Shubin <nikita.shubin at maquefel.me>
See original commit by Viacheslav Bocharov <v at baodeep.com>
https://github.com/adeepn/landau/commit/c694f9d9f41d0b5298fc822edcdfba61bceadb6c.
[Split checker into content repo site]
Signed-off-by: Nikita Shubin <nikita.shubin at maquefel.me>
Signed-off-by: Dmitry Rokosov <rockosov at rulkc.org>
---
.spellcheck-allow.txt | 89 +++++++++++++++++++++++++++++++++
scripts/spellcheck.sh | 112 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 201 insertions(+)
create mode 100644 .spellcheck-allow.txt
create mode 100755 scripts/spellcheck.sh
diff --git a/.spellcheck-allow.txt b/.spellcheck-allow.txt
new file mode 100644
index 000000000000..a14a49dc19eb
--- /dev/null
+++ b/.spellcheck-allow.txt
@@ -0,0 +1,89 @@
+AOSP
+BSP
+CalVer
+Dependabot
+LTS
+Lightbox
+ODM
+OSDevConf
+OpenHands
+PEM
+PRs
+RGB
+RSS
+RULKC
+SCSS
+SHA
+UI
+YAML
+backlink
+canonicalise
+changelog
+codespell
+cron
+dirs
+docroot
+env
+favicon
+flavoured
+frontmatter
+github
+hostname
+hunspell
+inlined
+labelled
+linux
+mailto
+md
+nginx
+og
+params
+px
+repo
+roadmap
+rsync
+rsyncing
+rsyncs
+rulkc
+submodule
+toolchain
+url
+апстрим
+Митапы
+Форк
+агенда
+архитектур
+бэк
+валидируемый
+валидны
+вендоров
+деплой
+десктопов
+драфты
+коммит
+коммитит
+коммиты
+конфиге
+мейнтейнеров
+мерж
+мержа
+митапы
+патчей
+перебилдит
+перевыпустили
+перегенерировать
+прогонятся
+продакшн
+продакшне
+пуше
+пушем
+разрулит
+ревью
+репозитории
+репозитория
+санкционные
+удвоений
+форк
+форка
+форке
+форки
diff --git a/scripts/spellcheck.sh b/scripts/spellcheck.sh
new file mode 100755
index 000000000000..caf08f486343
--- /dev/null
+++ b/scripts/spellcheck.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+# Spell-check Markdown sources using hunspell with ru_RU + en_US dictionaries.
+# Words listed in .spellcheck-allow.txt are accepted as correct.
+# Exit status: 0 = clean; non-zero = unknown words, no allow list, or hunspell failure.
+# Usage: scripts/spellcheck.sh [paths...]
+# Default scope when no args given: content/, docs/, README.md, AGENTS.md.
+
+set -euo pipefail
+
+ROOT=$(cd "$(dirname "$0")/.." && pwd)
+ALLOW="${ROOT}/.spellcheck-allow.txt"
+[ -f "${ALLOW}" ] || { echo "missing allow list: ${ALLOW}" >&2; exit 1; }
+
+cd "${ROOT}"
+
+PATHS=("$@")
+if [ "${#PATHS[@]}" -eq 0 ]; then
+  PATHS=(content docs README.md AGENTS.md)
+fi
+
+# Build the file list. Avoid `mapfile`/`readarray` so the script runs on
+# Bash 3.2 (macOS default) as well as Linux's modern bash.
+files=()
+while IFS= read -r line; do
+  [ -n "${line}" ] && files+=("${line}")
+done < <(
+  for p in "${PATHS[@]}"; do
+    [ -e "${p}" ] || continue
+    if [ -d "${p}" ]; then
+      find "${p}" -type f -name '*.md'
+    else
+      printf '%s\n' "${p}"
+    fi
+  done | sort -u
+)
+
+if [ "${#files[@]}" -eq 0 ]; then
+  echo "no markdown files found"
+  exit 0
+fi
+
+# Sanity check: confirm hunspell can actually load both dictionaries.
+# (On some environments — e.g. GitHub Actions ubuntu-24.04 runner image —
+# Ubuntu's hunspell-ru / hunspell-en-us packages can install successfully
+# but leave /usr/share/hunspell/ stripped of the .aff/.dic files. Set
+# DICPATH to a directory containing ru_RU.{aff,dic} and en_US.{aff,dic} as
+# a workaround.)
+for probe in 'ru_RU:архитектура' 'en_US:architecture'; do
+  word=${probe#*:}
+  # If hunspell itself fails (binary or primary dictionary missing) it prints
+  # nothing on stdout, so count a non-zero exit as a rejected probe word too.
+  rejected=$(echo "${word}" | hunspell -d ru_RU,en_US -l) || rejected=${word}
+  if [ "${rejected}" = "${word}" ]; then
+    echo "::error::hunspell ${probe%%:*} dictionary not loaded properly"
+    echo "DICPATH=${DICPATH:-(unset)}"
+    hunspell -D </dev/null 2>&1 | head -30 || true  # diagnostics only
+    exit 1
+  fi
+done
+
+# Explicit template — bare `mktemp` errors on macOS/BSD (`too few X's in template`).
+unknown=$(mktemp "${TMPDIR:-/tmp}/landau-spellcheck.XXXXXX")
+trap 'rm -f "${unknown}"' EXIT
+
+for f in "${files[@]}"; do
+  awk '
+    BEGIN { fm = 0; code = 0 }
+    NR == 1 && /^---[[:space:]]*$/ { fm = 1; next }
+    fm && /^---[[:space:]]*$/ { fm = 0; next }
+    fm { next }
+    /^[[:space:]]*```/ { code = !code; next }
+    code { next }
+    {
+      gsub(/<[^>]*>/, " ") # HTML tags
+      gsub(/`[^`]*`/, " ") # inline code spans
+      gsub(/https?:\/\/[^[:space:])]+/, " ") # URLs
+      gsub(/[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+/, " ") # emails
+      gsub(/[][(){}|*_~#]/, " ") # markdown punctuation
+      # Strip invisible Unicode chars that confuse hunspell tokenizer.
+      # Byte sequences below are the UTF-8 encodings of:
+      gsub(/\xef\xb8\x8f/, " ") # U+FE0F variation selector-16
+      gsub(/\xef\xb8\x8e/, " ") # U+FE0E variation selector-15
+      gsub(/\xe2\x80\x8d/, " ") # U+200D zero-width joiner
+      gsub(/\xe2\x80\x8c/, " ") # U+200C zero-width non-joiner
+      gsub(/\xe2\x80\x8b/, " ") # U+200B zero-width space
+      print
+    }
+  ' "$f" \
+    | hunspell -d ru_RU,en_US -l \
+    >> "${unknown}"
+done
+
+# Empty unknown list = nothing to filter; happy exit.
+if [ ! -s "${unknown}" ]; then
+  echo "Spell check OK across ${#files[@]} files."
+  exit 0
+fi
+
+# Filter accumulated unknowns against the allow list (exact, fixed strings).
+sort -u "${unknown}" -o "${unknown}"
+filtered=$(grep -vxFf "${ALLOW}" "${unknown}" || true)
+
+if [ -n "${filtered}" ]; then
+  echo "::error::Unknown words found in markdown sources:"
+  printf '%s\n' "${filtered}" | sed 's/^/ /'
+  echo
+  echo "If these are intentional (proper nouns, technical terms),"
+  echo "add them to .spellcheck-allow.txt. Otherwise, fix the spelling."
+  exit 1
+fi
+
+echo "Spell check OK across ${#files[@]} files."
--
2.48.1
More information about the rulkc
mailing list