This module provides a simple interface for testing whether a Russian word exists or not. The function lookup_word()
simply returns either true or false. The words are stored with acute accents that mark the stressed syllable. Single-syllable words carry no accent. A stressed letter ё carries no acute accent if it is the only letter ё in the word. Examples: "стол" (single syllable), "табуре́тка" (several syllables), "трёхэтажный" (a word with ё). Much more advanced modules can be developed on top of this basic functionality.
Below is information on how to develop and upgrade this module itself.
English Wiktionary dumps are parsed by https://kaikki.org and converted into reasonably machine-readable JSON files.
#!/usr/bin/env ruby
# Copyright © 2024 Ssvb, CC BY-SA 4.0 license
require "json"
require "open-uri"
# Location of the machine-readable Russian dictionary produced by kaikki.org.
KAIKKI_URL = "https://kaikki.org/dictionary/Russian/kaikki.org-dictionary-Russian.json"
# Every letter counted as a vowel (modern and pre-reform, both cases).
VOWELS = "аеёєэиіїоуюяыѣѵАЕЁЄЭИІЇОУЮЯЫѢѴ"
# Body of a regexp character class (ranges plus single letters) describing
# the letters a word may consist of.
LETTERS = "Ѐ-џҊ-ԧꚀ-ꚗѣѢѳѲѵѴʼ"
# Regexp source matching one whole word: letters plus the combining acute
# accent (U+0301) that marks the stressed vowel.
# NOTE(review): the character class was lost from the published text (only
# "+" survived). This is the minimal class that lets stressed words match;
# confirm whether the original allowed further characters.
WORD = "[#{LETTERS}\u0301]+"
# Accumulators used as sets: word => true.
uncategorized_words = {}
modern_words = {}
prereform_words = {}
# Pick the input stream: a local dump when a path is given as the first
# command-line argument, otherwise download the dictionary from kaikki.org.
# (ARGV itself is always truthy, so the first argument has to be tested.)
if ARGV[0]
  # puts instead of printf: a "%" in the file name must not be treated as a
  # format directive.
  STDERR.puts "Opening a local file #{ARGV[0]}"
  data = File.open(ARGV[0])
else
  STDERR.puts "Downloading #{KAIKKI_URL}"
  data = URI.open(KAIKKI_URL)
end
# Convert a word to its canonical form:
# 1. words with only a single syllable don't need a stress mark
# 2. words with a stressed ё letter don't need a stress mark if the word only
#    has a single ё
# 3. the secondary stress (grave accent, U+0300) is not needed
#
# word - String, possibly surrounded by whitespace and carrying combining
#        acute (U+0301) / grave (U+0300) accents.
# Returns the normalized String.
def normalize(word)
  word = word.strip.delete("\u0300")
  vowel_count = word.gsub(/[^#{VOWELS}]/, "").size
  jo_count = word.gsub(/[^ёЁ]/, "").size
  # The acute accent is redundant when the stressed vowel is already
  # unambiguous: it is the only vowel, or it is the only ё and ё is stressed.
  if vowel_count == 1 || (word =~ /[ёЁ]\u0301/ && jo_count == 1)
    word.delete("\u0301")
  else
    word
  end
end
# Validate a normalized word and, when acceptable, record it in +words+.
#
# word  - String in canonical form (see normalize).
# words - Hash used as a set (word => true); modified in place.
#
# Diagnostics are written to STDERR. The return value is not meaningful.
#
# NOTE(review): the published text of this method lost every bracketed
# expression (regexp character classes and hash keys). They are restored
# here from context; the individual guesses are marked below.
def process_word(word, words)
  vowels_cnt = word.gsub(/[^#{VOWELS}]/, "").size
  # A usable word consists of word characters only and has an unambiguous
  # stress: at most one vowel, an explicit acute accent, or a letter ё.
  unless word =~ /^#{WORD}$/ && (vowels_cnt <= 1 || word =~ /(.\u0301)|[ёЁ]/)
    if word =~ /^(#{WORD})\s+(#{WORD})$/
      # Two words, which actually act like a single word "до́ смерти"
      word1, word2 = $1, $2
      vowels_cnt2 = word2.gsub(/[^#{VOWELS}]/, "").size
      # NOTE(review): restored as "second word has neither an acute accent
      # nor ё" - confirm.
      unless vowels_cnt2 <= 1 || word2 =~ /\u0301|[ёЁ]/
        # http://www.philol.msu.ru/~fonetica/akcent/bezud_slaboud/sochet.html
        # NOTE(review): both keys were lost; storing each part separately is
        # the reading suggested by the two consecutive assignments - confirm.
        words[word1] = true
        words[word2] = true
        STDERR.puts "Maybe предложно-именное сочетание \"#{word1} #{word2}\"?"
      end
    end
    return
  end
  stress_cnt = word.gsub(/[^\u0301]/, "").size
  # An acute accent may only follow a vowel.
  if word =~ /[^#{VOWELS}]\u0301/
    STDERR.puts "Invalid stress position: #{word}"
    return
  end
  if stress_cnt >= 2
    STDERR.puts "More than one possible stressed syllable in \"#{word}\""
    # Mark every stress position with "#", then turn the markers into an
    # acute accent one at a time, recording each single-stress variant.
    tmp = word.gsub(/\u0301/, "#")
    while tmp =~ /#/
      tmp = tmp.gsub(/\u0301/, "").gsub(/^([^#]*)#/, "\\1\u0301")
      # NOTE(review): the key was lost; the variant without the remaining
      # markers is the only sensible candidate - confirm.
      words[tmp.gsub("#", "")] = true
    end
  end
  words[word] = true
end
# Walk the dump (one JSON object per line) and sort every inflected form
# into the modern / pre-reform / uncategorized accumulators.
#
# NOTE(review): every JSON field accessor was lost from the published text.
# The accessors below follow the kaikki.org entry layout ("senses" ->
# "glosses", "forms" -> "form"/"tags", "head_templates" -> "name"/"args");
# verify against a current dump.
data.each_line do |l|
  entry = JSON.parse(l)
  # Filter out any word with just a single sense marked as "Pre-1918"
  if entry.has_key?("senses") && entry["senses"].size == 1
    sense = entry["senses"][0]
    next if sense.has_key?("glosses") && sense["glosses"][0] =~ /^Pre\-1918/
  end
  modern = {}
  prereform = {}
  # Forms are listed after the inflection template that produced them; forms
  # following a "ru-pre-reform..." template are pre-reform spellings.
  prereform_mode = false
  if entry.has_key?("forms")
    entry["forms"].each do |form|
      if form.has_key?("tags") && form["tags"].include?("inflection-template")
        if form["form"].strip =~ /^ru\-pre\-reform/
          prereform_mode = true
          next
        else
          prereform_mode = false
        end
      end
      word = normalize(form["form"])
      if prereform_mode || (form.has_key?("tags") && form["tags"].include?("dated"))
        prereform[word] = true
      else
        modern[word] = true
      end
    end
    modern.each do |word, _|
      process_word(word, modern_words)
      # A form that is also valid in the modern orthography is not pre-reform.
      prereform.delete(word)
    end
    prereform.each do |word, _|
      process_word(word, prereform_words)
    end
  end
  if entry.has_key?("head_templates") && entry["head_templates"][0]["name"] =~ /^ru\-/
    # NOTE(review): which field held the headword could not be recovered;
    # the first positional template argument is assumed - confirm.
    word = entry.dig("head_templates", 0, "args", "1")
    process_word(normalize(word), uncategorized_words) if word && !modern.has_key?(word)
  end
end
# Headwords that never showed up as a pre-reform form are treated as modern.
uncategorized_words.each do |word, _|
  next if prereform_words.has_key?(word)
  # Add to the set; a plain "modern_words = true" would replace the hash.
  modern_words[word] = true
end
# Build the final word list, rejecting anything that still looks like
# pre-reform orthography, then sort it.
words = []
modern_words.each do |word, _|
  # NOTE(review): the character class of the first test was lost from the
  # published text. Letters specific to the pre-reform orthography are
  # assumed here, in line with the trailing-ъ test - confirm.
  if word =~ /[іІѣѢѳѲѵѴ]/ || word =~ /ъ$/
    STDERR.puts "Survived until the final safety net caught it: #{word}"
  else
    words.push(word)
  end
end
words.sort!
# Collect statistics that are published in the header of the word list:
# how far from the end of a word (counted in vowels) the stress mark and the
# letter ё can be found, and which words contain more than one ё.
max_stress_pos = 0
max_stress_pos_word = ""
max_jo_pos = 0
max_jo_pos_word = ""
words_with_double_jo = []
words.each do |word|
  # Keep only vowels and stress marks, reversed: the index of the accent is
  # then the number of vowels that follow the stressed one.
  tmp = word.gsub(/[^#{VOWELS}\u0301]/, "").reverse
  idx = tmp.index("\u0301")
  if idx && idx + 1 > max_stress_pos
    max_stress_pos_word = word
    max_stress_pos = idx + 1
  end
  # Same measurement for the last ё, with the accents removed.
  tmp = tmp.gsub(/\u0301/, "")
  idx = tmp.index(/[ёЁ]/)
  if idx && idx + 1 > max_jo_pos
    max_jo_pos_word = word
    max_jo_pos = idx + 1
  end
  if word =~ /[ёЁ].*[ёЁ]/
    words_with_double_jo.push(word)
  end
end
# Print the "# key = value" metadata header, then the words one per line.
quoted_double_jo = words_with_double_jo.map { |w| "\"" + w + "\"" }.join(", ")
[
  "# data_source = \"#{KAIKKI_URL}\"",
  "# max_stress_search_steps = #{max_stress_pos}",
  "# worst_stress_search_word = \"#{max_stress_pos_word}\"",
  "# max_jo_search_steps = #{max_jo_pos}",
  "# worst_jo_search_word = \"#{max_jo_pos_word}\"",
  "# words_with_double_jo = { #{quoted_double_jo} }",
].each { |header_line| puts header_line }
puts words.join("\n")
The Lua module code can be generated automatically from a list of words.
#!/usr/bin/env ruby
# Copyright © 2024 Ssvb, CC BY-SA 4.0 license
require 'digest'
# Target false positive rate of the generated filter: the chance that an
# arbitrary word which is NOT in the dictionary is nevertheless reported as
# present. A lower rate needs a larger bit buffer, so adjusting it also
# affects storage efficiency. Passed to BloomFilterBuilder.new below.
ACCEPTABLE_COLLISION_PROBABILITY = 1.0 / 600_000_000
# Builds a Bloom filter over a set of words and renders it as the source of a
# Lua module. The bit buffer is kept as an array of 6-bit cells so that each
# cell maps onto exactly one character of the base64 alphabet.
class BloomFilterBuilder
  # Estimated false positive rate of a filter with +buf_size+ 6-bit cells
  # holding +num_words+ words, each hashed to +k+ bit positions.
  def estimate_collision_probability(buf_size, num_words, k)
    (1.0 - ((1.0 - 1.0 / (buf_size * 6)) ** (num_words * k))) ** k
  end

  # Find the smallest buffer (in 6-bit cells) reaching false positive rate
  # +p+ for +num_words+ words, trying every k from 1 to 32 (hash_word can
  # supply at most 32 positions per word).
  # Returns [buf_size, k].
  def optimal_buf_size(num_words, p)
    best_buf_size = 10 ** 18
    best_k = 32
    (1 .. 32).each do |k|
      cur_size = (1 .. 10 ** 18).bsearch do |buf_size|
        estimate_collision_probability(buf_size, num_words, k) < p
      end
      if cur_size < best_buf_size
        best_buf_size = cur_size
        best_k = k
      end
    end
    return best_buf_size, best_k
  end

  # num_words - number of words that are going to be inserted.
  # acceptable_collision_probability - target false positive rate.
  def initialize(num_words, acceptable_collision_probability = 1.0 / 1000_000_000)
    @words_cnt, @p = 0, acceptable_collision_probability
    buf_size, @k = optimal_buf_size(num_words, acceptable_collision_probability)
    @buf = [0] * buf_size
  end

  # Set bit number +bit+ (0-based) of the buffer; six bits live in one cell.
  def set_bit(bit)
    @buf[bit / 6] |= 1 << (bit % 6)
  end

  # Derive @k bit positions for +word+ from two chained SHA-512 digests
  # (256 hex digits in total, consumed in 32-bit slices). The generated Lua
  # code repeats the same chain: sha512(word), then sha512 of that hex text.
  def hash_word(word)
    word_hash1 = Digest::SHA512.hexdigest(word)
    word_hash2 = Digest::SHA512.hexdigest(word_hash1)
    return (word_hash1 + word_hash2).chars.each_slice(8).map(&:join)
               .take(@k).map {|v| v.to_i(16) % (@buf.size * 6) }
  end

  # Add +word+ to the filter.
  def insert_word(word)
    @words_cnt += 1
    hash_word(word).each {|bit| set_bit(bit) }
  end

  # Render the filter as Lua module source code.
  #
  # extra_info - Hash of name => Lua literal (already quoted where needed)
  #              exposed through query_extra_info(); "data_source", when
  #              present, is also mentioned in the header comment.
  # Returns the Lua source as a String.
  def export_to_lua(extra_info)
    base64alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    # One base64 character per cell, cut into chunks of 2040000 characters
    # (the chunk size used by the original script).
    tmp = @buf.map {|x| base64alphabet[x] }.join.chars.each_slice(2040000).map(&:join)
    # For every bit 0..5: the set of base64 characters whose value has that
    # bit set, written as Lua table fields.
    base64_lut = 0.upto(5).map do |bit|
      0.upto(63).map do |val|
        "[\"" + base64alphabet[val] + "\"] = 1" if (val & (1 << bit)) != 0
      end.compact.join(", ")
    end
    fprate = (1.0 / @p > 3000000) ? format("%.0f millions", 1.0 / @p / 1000000) :
                                    format("%.0f", 1.0 / @p)
    storage_eff = format("~%.1f", @buf.size.to_f / @words_cnt)
    source_name = (extra_info && extra_info["data_source"]) || ARGV[0]
    return <<LUA_CODE_END
-- A word dictionary implementation based on the https://en.wikipedia.org/wiki/Bloom_filter algorithm.
-- Capacity: #{@words_cnt} words (#{storage_eff} bytes per word). Expected false positive rate: 1 in #{fprate}.
-- Automatically generated from #{source_name}.

local export = {}

local bloom_filter_k = #{@k}

local bloom_filter_bitbuf = {
	#{tmp.map {|x| "\"" + x + "\"" }.join(",\n\t")}
}

local bloom_filter_base64dec_lut = {
	#{base64_lut.map {|x| "{ " + x + "}" }.join(",\n\t")}
}

if not mw then
	-- for local testing using something like:
	--     https://github.com/Egor-Skriptunoff/pure_lua_SHA/blob/master/sha2.lua
	--     https://stackoverflow.com/questions/51559181/sha512-pure-lua-5-1-adaptation/51561685#51561685
	local sha2 = require("sha2")
	mw = { hash = {} }
	function mw.hash.hashValue(algo, text) return sha2.sha512(text) end
end

-- Returns true if the word is found in the dictionary and false otherwise
function export.lookup_word(word)
	local h, cnt, bufsize = word, 0, 0
	for _, bitchunk in ipairs(bloom_filter_bitbuf) do
		bufsize = bufsize + string.len(bitchunk) * 6
	end
	while true do
		h = mw.hash.hashValue("sha512", h)
		for i = 1, 128, 8 do
			local idx = tonumber(h:sub(i, i + 8 - 1), 16) % bufsize
			local rem = idx % 6
			local div = (idx - rem) / 6
			for _, bitchunk in ipairs(bloom_filter_bitbuf) do
				if div + 1 <= string.len(bitchunk) then
					local val = string.sub(bitchunk, div + 1, div + 1)
					if not bloom_filter_base64dec_lut[rem + 1][val] then
						return false
					end
					break
				end
				div = div - string.len(bitchunk)
			end
			cnt = cnt + 1
			if cnt >= bloom_filter_k then
				return true
			end
		end
	end
end

function export.query_extra_info()
	return {
		#{extra_info.to_a.map {|x| "[\"" + x[0] + "\"] = " + x[1] }.join(",\n\t\t") }
	}
end

return export
LUA_CODE_END
  end
end
# Entry point: read the word list named by the first command-line argument
# and print the generated Lua module to standard output.
abort "Usage: ruby #{$PROGRAM_NAME} <words.txt>\nWhere: words.txt - text file, one word per line\n" unless ARGV[0]
words = {}
extra_info = {}
File.foreach(ARGV[0]) do |l|
  if l =~ /^#\s*(.*?)\s*\=\s*(.*?)\s*$/
    # "# name = value" header lines carry metadata, not words.
    extra_info[$1] = $2
  else
    word = l.strip
    # Blank lines must not become an (empty) dictionary word.
    words[word] = true unless word.empty?
  end
end
bf_builder = BloomFilterBuilder.new(words.size, ACCEPTABLE_COLLISION_PROBABILITY)
words.keys.each {|word| bf_builder.insert_word(word) }
puts bf_builder.export_to_lua(extra_info)
If the produced Lua file is larger than the 2MB limit (which is the case for the Russian dictionary), the bit buffer has to be moved out into several data submodules, each returning one chunk of the string, and loaded like this:
-- Each data submodule returns one chunk of the base64-encoded bit buffer.
local bloom_filter_bitbuf = {
	require("Module:ru-accentdict/data1"), -- return "miAokDFC+g ... KmJpQKIPk"
	require("Module:ru-accentdict/data2"), -- return "..."
	require("Module:ru-accentdict/data3"), -- return "..."
}
-- A Russian dictionary. Based on the https://en.wikipedia.org/wiki/Bloom_filter algorithm.
-- Capacity: 870005 words (~7.0 bytes per word). Expected false positive rate: 1 in 600 millions.
-- Automatically generated from "enwiktionary-20240401-pages-articles-multistream.xml.bz2"
-- via https://kaikki.org/dictionary/Russian/kaikki.org-dictionary-Russian.json
-- Table returned at the end of the module; the public functions are
-- attached to it below.
local export = {}
-- Number of bit positions lookup_word() checks before reporting a match.
local bloom_filter_k = 29
-- The bit buffer, split across data submodules. lookup_word() walks the
-- chunks in order and treats them as one long string.
local bloom_filter_bitbuf = {
require("Module:User:Ssvb/ru-accentdict/data1"),
require("Module:User:Ssvb/ru-accentdict/data2"),
require("Module:User:Ssvb/ru-accentdict/data3"),
}
-- Decoding table for the base64-encoded bit buffer. Entry [bit + 1] (bit = 0..5)
-- holds every base64 character whose 6-bit value has that bit set, so
-- lut[bit + 1][char] is 1 when the bit is set and nil otherwise.
-- Alphabet: A-Z = 0..25, a-z = 26..51, 0-9 = 52..61, "+" = 62, "/" = 63.
local bloom_filter_base64dec_lut = {
	{ ["B"] = 1, ["D"] = 1, ["F"] = 1, ["H"] = 1, ["J"] = 1, ["L"] = 1, ["N"] = 1, ["P"] = 1, ["R"] = 1, ["T"] = 1, ["V"] = 1, ["X"] = 1, ["Z"] = 1, ["b"] = 1, ["d"] = 1, ["f"] = 1, ["h"] = 1, ["j"] = 1, ["l"] = 1, ["n"] = 1, ["p"] = 1, ["r"] = 1, ["t"] = 1, ["v"] = 1, ["x"] = 1, ["z"] = 1, ["1"] = 1, ["3"] = 1, ["5"] = 1, ["7"] = 1, ["9"] = 1, ["/"] = 1},
	{ ["C"] = 1, ["D"] = 1, ["G"] = 1, ["H"] = 1, ["K"] = 1, ["L"] = 1, ["O"] = 1, ["P"] = 1, ["S"] = 1, ["T"] = 1, ["W"] = 1, ["X"] = 1, ["a"] = 1, ["b"] = 1, ["e"] = 1, ["f"] = 1, ["i"] = 1, ["j"] = 1, ["m"] = 1, ["n"] = 1, ["q"] = 1, ["r"] = 1, ["u"] = 1, ["v"] = 1, ["y"] = 1, ["z"] = 1, ["2"] = 1, ["3"] = 1, ["6"] = 1, ["7"] = 1, ["+"] = 1, ["/"] = 1},
	{ ["E"] = 1, ["F"] = 1, ["G"] = 1, ["H"] = 1, ["M"] = 1, ["N"] = 1, ["O"] = 1, ["P"] = 1, ["U"] = 1, ["V"] = 1, ["W"] = 1, ["X"] = 1, ["c"] = 1, ["d"] = 1, ["e"] = 1, ["f"] = 1, ["k"] = 1, ["l"] = 1, ["m"] = 1, ["n"] = 1, ["s"] = 1, ["t"] = 1, ["u"] = 1, ["v"] = 1, ["0"] = 1, ["1"] = 1, ["2"] = 1, ["3"] = 1, ["8"] = 1, ["9"] = 1, ["+"] = 1, ["/"] = 1},
	{ ["I"] = 1, ["J"] = 1, ["K"] = 1, ["L"] = 1, ["M"] = 1, ["N"] = 1, ["O"] = 1, ["P"] = 1, ["Y"] = 1, ["Z"] = 1, ["a"] = 1, ["b"] = 1, ["c"] = 1, ["d"] = 1, ["e"] = 1, ["f"] = 1, ["o"] = 1, ["p"] = 1, ["q"] = 1, ["r"] = 1, ["s"] = 1, ["t"] = 1, ["u"] = 1, ["v"] = 1, ["4"] = 1, ["5"] = 1, ["6"] = 1, ["7"] = 1, ["8"] = 1, ["9"] = 1, ["+"] = 1, ["/"] = 1},
	{ ["Q"] = 1, ["R"] = 1, ["S"] = 1, ["T"] = 1, ["U"] = 1, ["V"] = 1, ["W"] = 1, ["X"] = 1, ["Y"] = 1, ["Z"] = 1, ["a"] = 1, ["b"] = 1, ["c"] = 1, ["d"] = 1, ["e"] = 1, ["f"] = 1, ["w"] = 1, ["x"] = 1, ["y"] = 1, ["z"] = 1, ["0"] = 1, ["1"] = 1, ["2"] = 1, ["3"] = 1, ["4"] = 1, ["5"] = 1, ["6"] = 1, ["7"] = 1, ["8"] = 1, ["9"] = 1, ["+"] = 1, ["/"] = 1},
	{ ["g"] = 1, ["h"] = 1, ["i"] = 1, ["j"] = 1, ["k"] = 1, ["l"] = 1, ["m"] = 1, ["n"] = 1, ["o"] = 1, ["p"] = 1, ["q"] = 1, ["r"] = 1, ["s"] = 1, ["t"] = 1, ["u"] = 1, ["v"] = 1, ["w"] = 1, ["x"] = 1, ["y"] = 1, ["z"] = 1, ["0"] = 1, ["1"] = 1, ["2"] = 1, ["3"] = 1, ["4"] = 1, ["5"] = 1, ["6"] = 1, ["7"] = 1, ["8"] = 1, ["9"] = 1, ["+"] = 1, ["/"] = 1}
}
-- Outside MediaWiki there is no global "mw": provide the one function this
-- module needs, backed by a pure Lua SHA-512 implementation.
if not mw then
	-- for local testing using something like:
	--     https://github.com/Egor-Skriptunoff/pure_lua_SHA/blob/master/sha2.lua
	--     https://stackoverflow.com/questions/51559181/sha512-pure-lua-5-1-adaptation/51561685#51561685
	local sha2 = require("sha2")
	mw = { hash = {} }
	-- Same call shape as mw.hash.hashValue(); algo is ignored, SHA-512 is
	-- the only algorithm this module requests.
	function mw.hash.hashValue(algo, text) return sha2.sha512(text) end
end
-- Returns true if the word is found in the dictionary and false otherwise.
-- The answer is probabilistic: false is always correct, true may be a false
-- positive at the rate stated in the header of this module.
-- word: string in canonical form (acute accent on the stressed vowel).
function export.lookup_word(word)
	local h, cnt, bufsize = word, 0, 0
	-- Total number of bits: every character of every chunk carries 6 bits.
	for _, bitchunk in ipairs(bloom_filter_bitbuf) do
		bufsize = bufsize + string.len(bitchunk) * 6
	end
	while true do
		-- Hash chain: sha512(word), then sha512 of the previous hex digest.
		h = mw.hash.hashValue("sha512", h)
		-- Each digest (128 hex digits) yields sixteen 32-bit bit positions.
		for i = 1, 128, 8 do
			local idx = tonumber(h:sub(i, i + 8 - 1), 16) % bufsize
			local rem = idx % 6         -- bit number inside the character
			local div = (idx - rem) / 6 -- 0-based character offset
			-- Locate the chunk containing that character.
			for _, bitchunk in ipairs(bloom_filter_bitbuf) do
				if div + 1 <= string.len(bitchunk) then
					local val = string.sub(bitchunk, div + 1, div + 1)
					-- A single clear bit proves the word is absent.
					if not bloom_filter_base64dec_lut[rem + 1][val] then
						return false
					end
					break
				end
				div = div - string.len(bitchunk)
			end
			cnt = cnt + 1
			if cnt >= bloom_filter_k then
				return true
			end
		end
	end
end
-- Returns the metadata recorded when the word list was generated: the data
-- source, the largest distance (counted in vowels from the end of the word)
-- at which a stress mark or a letter ё occurs, an example word for each, and
-- all words containing more than one letter ё.
function export.query_extra_info()
	return {
		["data_source"] = "https://kaikki.org/dictionary/Russian/kaikki.org-dictionary-Russian.json",
		["max_stress_search_steps"] = 7,
		["worst_stress_search_word"] = "благоустра́ивающаяся",
		["max_jo_search_steps"] = 4,
		["worst_jo_search_word"] = "посерьёзневшее",
		["words_with_double_jo"] = { "Бёрёлё́х", "Бёрёлё́ха", "Бёрёлё́хе", "Бёрёлё́хом", "Бёрёлё́ху", "трёхколё́сная", "трёхколё́сного", "трёхколё́сное", "трёхколё́сной", "трёхколё́сном", "трёхколё́сному", "трёхколё́сною", "трёхколё́сную", "трёхколё́сные", "трёхколё́сный", "трёхколё́сным", "трёхколё́сными", "трёхколё́сных", "четырёхзвё́здная", "четырёхзвё́здного", "четырёхзвё́здное", "четырёхзвё́здной", "четырёхзвё́здном", "четырёхзвё́здному", "четырёхзвё́здною", "четырёхзвё́здную", "четырёхзвё́здные", "четырёхзвё́здный", "четырёхзвё́здным", "четырёхзвё́здными", "четырёхзвё́здных" }
	}
end
return export