Module:User:Ssvb/ru-accentdict

The following documentation is located at Module:User:Ssvb/ru-accentdict/documentation. Categories were auto-generated by Module:documentation.

Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox

Introduction

This module provides a simple interface for testing whether a Russian word exists. The function lookup_word() returns either true or false. Words are stored with acute accents marking the stressed syllable. Single-syllable words carry no accent mark. A stressed letter ё carries no acute accent if it is the only letter ё in the word. Examples: "стол" (single syllable), "табуре́тка" (several syllables), "трёхэтажный" (a word with ё). Much more advanced modules can be developed on top of this basic functionality.

Building the dictionary

Below is information on how to develop and upgrade this module itself.

Obtaining the list of Russian words

English Wiktionary dumps are parsed by https://kaikki.org and converted into reasonably machine readable JSON files.

Ruby script for parsing the kaikki's JSON to extract the list of Russian words:

#!/usr/bin/env ruby
# Copyright © 2024 Ssvb, CC BY-SA 4.0 license
require "json"
require "open-uri"

# Location of the kaikki.org extract; it is read line by line and every line
# is parsed as one JSON object.
KAIKKI_URL = "https://kaikki.org/dictionary/Russian/kaikki.org-dictionary-Russian.json"
# Vowel letters in both cases, including several historical/non-Russian
# Cyrillic vowels. Presumably interpolated into the syllable-counting
# regexes — TODO confirm, the patterns in this copy look truncated.
VOWELS = "аеёєэиіїоуюяыѣѵАЕЁЄЭИІЇОУЮЯЫѢѴ"
# Character ranges written in regex character-class syntax (Cyrillic blocks
# plus a few individually listed letters and the modifier apostrophe).
LETTERS = "Ѐ-џҊ-ԧꚀ-ꚗѣѢѳѲѵѴʼ"
# Regex source fragment for one word; interpolated as /^#{WORD}$/ below.
# NOTE(review): a bare "+" has nothing to repeat, so this value looks
# truncated — presumably a character class built from LETTERS preceded the
# "+". Confirm against the original script.
WORD = "+"
# Accumulators used as sets (word => true).
uncategorized_words = {}
modern_words = {}
prereform_words = {}

# Read the dump from a local file when a path is given as the first
# command-line argument; otherwise download it from kaikki.org.
# ARGV itself is always truthy (it is an Array), so the first element has to
# be tested explicitly or the download branch can never run.
if ARGV[0]
  STDERR.puts "Opening a local file #{ARGV[0]}"
  data = File.open(ARGV[0])
else
  STDERR.puts "Downloading #{KAIKKI_URL}"
  data = URI.open(KAIKKI_URL)
end

# Convert a word to the canonical form used by the dictionary:
#  1. words with only a single syllable don't need a stress mark
#  2. words with a stressed ё letter don't need a stress mark if the word
#     only has a single ё
#  3. the secondary stress grave diacritic sign is never kept
#
# word - String; surrounding whitespace is ignored.
# Returns a new String; the argument is not modified.
def normalize(word)
  # U+0300 COMBINING GRAVE ACCENT marks secondary stress; always dropped.
  word = word.strip.gsub("\u0300", "")
  # Deleting every non-vowel leaves one character per syllable.
  single_syllable = word.gsub(/[^#{VOWELS}]/, "").size == 1
  # A ё that carries the acute accent and is the only ё in the word.
  lone_stressed_jo = word =~ /[ёЁ]\u0301/ && word.gsub(/[^ёЁ]/, "").size == 1
  if single_syllable || lone_stressed_jo
    # U+0301 COMBINING ACUTE ACCENT is redundant in both cases.
    word.gsub("\u0301", "")
  else
    word
  end
end

# Validates one (already normalized) word and is meant to record it in the
# `words` collection passed by the caller.
#
# NOTE(review): this copy of the method looks truncated — every bracketed
# `[...]` span appears to be missing. Symptoms: empty `//` patterns, the
# pattern `/^(*)#/` (a repeat operator with nothing to repeat), and bare
# `words = true` assignments, which only rebind the local parameter and
# store nothing in the caller's hash. Presumably these were character
# classes and `words[...] = true` subscripts — confirm against the original
# script before relying on this code.
def process_word(word, words)
    # NOTE(review): with an empty pattern gsub deletes nothing, so this is
    # the full string length; presumably it counted vowels only.
    vowels_cnt = word.gsub(//, "").size
    # Reject anything that is not a single WORD, or that has several
    # syllables without an accented character (second alternative truncated).
    unless word =~ /^#{WORD}$/ && (vowels_cnt <= 1 || word =~ /(.́)|/)
      if word =~ /^(#{WORD})\s+(#{WORD})$/
        # Two words, which actually act like a single word "до́ смерти"
        word1, word2 = $1, $2
        vowels_cnt2 = word2.gsub(//, "").size
        unless vowels_cnt2 <= 1 || word2 =~ //
          # http://www.philol.msu.ru/~fonetica/akcent/bezud_slaboud/sochet.html
          # NOTE(review): both assignments below lost their subscript; which
          # keys were stored cannot be told from here.
          words = true
          words = true
          STDERR.puts "Maybe предложно-именное сочетание \"#{word1} #{word2}\"?"
        end
      end
      return
    end
    # NOTE(review): presumably counted acute accents; pattern truncated.
    stress_cnt = word.gsub(//, "").size
    # NOTE(review): as written this matches ANY acute accent, so every
    # stressed word is reported as invalid; presumably the pattern required
    # a non-vowel before the accent.
    if word =~ /́/
      STDERR.puts "Invalid stress position: #{word}"
      return
    end
    if stress_cnt >= 2
      STDERR.puts "More than one possible stressed syllable in \"#{word}\""
      # Replace every accent with a '#' placeholder, then turn the
      # placeholders back into an accent one at a time, left to right.
      tmp = word.gsub(/́/, "#")
      while tmp =~ /#/
        tmp = tmp.gsub(/́/, "").gsub(/^(*)#/, "\\1́")
        words = true
      end
    end
    words = true
end

# Walk the dump one JSON entry per line and sort every listed form into the
# modern or the pre-reform word set.
#
# NOTE(review): the field paths used here ("senses"/"glosses", "forms" with
# "form"/"tags", "head_templates" with "name"/"args") were reconstructed
# from the kaikki.org (wiktextract) JSON layout, because the subscripts were
# missing from this copy of the script. The head template path in particular
# (first template, argument "1") should be confirmed against real data.
data.each_line do |l|
  entry = JSON.parse(l)

  # Filter out any word with just a single sense marked as "Pre-1918"
  if entry.has_key?("senses") && entry["senses"].size == 1
    sense = entry["senses"][0]
    next if sense.has_key?("glosses") && sense["glosses"][0] =~ /^Pre\-1918/
  end

  modern = {}
  prereform = {}
  # Set while iterating the forms that follow a "ru-pre-reform..." inflection
  # template marker; cleared again by any other inflection template marker.
  prereform_mode = false
  if entry.has_key?("forms")
    entry["forms"].each do |form|
      tags = form["tags"]
      if tags && tags.include?("inflection-template")
        if form["form"].strip =~ /^ru\-pre\-reform/
          prereform_mode = true
          next
        else
          prereform_mode = false
        end
      end
      word = normalize(form["form"])
      if prereform_mode || (tags && tags.include?("dated"))
        prereform[word] = true
      else
        modern[word] = true
      end
    end
    # A form seen in both sets counts as modern only.
    modern.each_key do |word|
      process_word(word, modern_words)
      prereform.delete(word)
    end
    prereform.each_key do |word|
      process_word(word, prereform_words)
    end
  end

  # Headword taken from the first "ru-..." head template; kept separately
  # because it may be either a modern or a pre-reform spelling.
  head = entry.has_key?("head_templates") ? entry["head_templates"][0] : nil
  if head && head["name"] =~ /^ru\-/
    word = (head["args"] || {})["1"]
    process_word(normalize(word), uncategorized_words) if word && !modern.has_key?(word)
  end
end

# Headwords that were not classified through an inflection table are treated
# as modern unless they are already known as pre-reform spellings.
# The word must be stored as a key: assigning `true` to modern_words itself
# would replace the whole hash and break the iteration that follows.
uncategorized_words.each_key do |word|
  modern_words[word] = true unless prereform_words.has_key?(word)
end

# Collect the final, sorted word list. Anything that still looks like
# pre-reform orthography at this point is reported and dropped.
# NOTE(review): the letter class below is a reconstruction (the pattern was
# empty in this copy, which would match every word); it lists the letters
# abolished by the 1918 reform — confirm against the original script.
words = []
modern_words.each_key do |word|
  if word =~ /[іІѣѢѳѲѵѴ]/ || word =~ /ъ$/
    STDERR.puts "Survived until the final safety net caught it: #{word}"
  else
    words.push(word)
  end
end

words.sort!

# Statistics emitted as "# key = value" header lines ahead of the word list:
# how many syllables from the end of a word the stress / the letter ё can
# sit (with the word that sets each record), plus all words with two ё.
max_stress_pos = 0
max_stress_pos_word = ""
max_jo_pos = 0
max_jo_pos_word = ""

words_with_double_jo = []

words.each do |word|
  # Keep only vowels and acute accents, then reverse. The combining accent
  # follows its vowel, so after reversing it precedes it and its index
  # equals the number of vowels after the stressed one.
  tmp = word.gsub(/[^#{VOWELS}\u0301]/, "").reverse
  idx = tmp.index("\u0301")
  if idx && idx + 1 > max_stress_pos
    max_stress_pos_word = word
    max_stress_pos = idx + 1
  end

  # Same measurement for the last ё, with the accent removed so that each
  # remaining character is exactly one syllable.
  tmp = tmp.gsub("\u0301", "")
  idx = tmp.index(/[ёЁ]/)
  if idx && idx + 1 > max_jo_pos
    max_jo_pos_word = word
    max_jo_pos = idx + 1
  end

  if word =~ /[ёЁ].*[ёЁ]/
    words_with_double_jo.push(word)
  end
end

puts "# data_source = \"#{KAIKKI_URL}\""
puts "# max_stress_search_steps = #{max_stress_pos}"
puts "# worst_stress_search_word = \"#{max_stress_pos_word}\""
puts "# max_jo_search_steps = #{max_jo_pos}"
puts "# worst_jo_search_word = \"#{max_jo_pos_word}\""
puts "# words_with_double_jo = { #{words_with_double_jo.map {|x| "\"" + x + "\"" }.join(", ")} }"
puts words.join("\n")

Converting the list of words into a Lua module

The Lua module code can be generated automatically from a list of words.

Ruby script for generating a Lua module from the supplied list of words:

#!/usr/bin/env ruby
# Copyright © 2024 Ssvb, CC BY-SA 4.0 license
require 'digest'

# Any random word, not present in the dictionary, has this chance of
# being mistakenly reported as if it were there (a false positive).
# Adjusting false positives rate also affects storage efficiency.
ACCEPTABLE_COLLISION_PROBABILITY = 1.0 / 600_000_000

# Builds a Bloom filter over a word list and serialises it as the source of
# a Lua module.
#
# The bit buffer is an Array of integers holding 6 bits each, so that every
# cell maps onto exactly one character of the base64 alphabet in the
# generated Lua string.
class BloomFilterBuilder
  # Estimated false-positive probability of a filter with buf_size 6-bit
  # cells holding num_words words, with k bits probed per word.
  def estimate_collision_probability(buf_size, num_words, k)
    (1.0 - ((1.0 - 1.0 / (buf_size * 6)) ** (num_words * k))) ** k
  end

  # Finds the smallest buffer (in cells) that reaches false-positive rate p,
  # trying every k from 1 to 32 (hash_word can supply at most 32 indices).
  # Returns [buf_size, k].
  def optimal_buf_size(num_words, p)
    best_buf_size = 10 ** 18
    best_k = 32
    (1 .. 32).each do |k|
      cur_size = (1 .. 10 ** 18).bsearch do |buf_size|
        estimate_collision_probability(buf_size, num_words, k) < p
      end
      # bsearch yields nil when no size in the range satisfies p for this k.
      if cur_size && cur_size < best_buf_size
        best_buf_size = cur_size
        best_k = k
      end
    end
    return best_buf_size, best_k
  end

  # num_words - expected number of words to be inserted.
  # acceptable_collision_probability - target false-positive rate.
  def initialize(num_words, acceptable_collision_probability = 1.0 / 1000_000_000)
    @words_cnt, @p = 0, acceptable_collision_probability
    buf_size, @k = optimal_buf_size(num_words, acceptable_collision_probability)
    @buf = [0] * buf_size
  end

  # Sets one bit of the filter: cell bit / 6, bit position bit % 6.
  def set_bit(bit)
    @buf[bit / 6] |= 1 << (bit % 6)
  end

  # Returns the @k bit indices of a word. Two chained SHA-512 hex digests
  # give 256 hex digits, i.e. 32 slices of 8 digits; each slice is read as a
  # number and reduced modulo the filter size in bits.
  def hash_word(word)
    word_hash1 = Digest::SHA512.hexdigest(word)
    word_hash2 = Digest::SHA512.hexdigest(word_hash1)
    return (word_hash1 + word_hash2).chars.each_slice(8).map(&:join)
      .take(@k).map {|v| v.to_i(16) % (@buf.size * 6) }
  end

  # Adds one word to the filter.
  def insert_word(word)
    @words_cnt += 1
    hash_word(word).each {|bit| set_bit(bit) }
  end

  # Returns the Lua module source as a String.
  #
  # extra_info - Hash of name => Lua expression (both Strings) exposed by
  #              the generated query_extra_info(); may be nil.
  def export_to_lua(extra_info)
    base64alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    # One base64 character per cell, cut into chunks of 2040000 characters.
    tmp = @buf.map {|x| base64alphabet[x] }.join.chars.each_slice(2040000).map(&:join)
    # For each of the 6 bit positions: the characters whose value has that
    # bit set, rendered as Lua table entries.
    base64_lut = 0.upto(5).map do |bit|
      0.upto(63).map do |val|
        "[\"" + base64alphabet[val] + "\"] = 1" if (val & (1 << bit)) != 0
      end.compact.join(", ")
    end
    fprate = (1.0 / @p > 3000000) ? format("%.0f millions", 1.0 / @p / 1000000) :
                                    format("%.0f", 1.0 / @p)
    storage_eff = format("~%.1f", @buf.size.to_f / @words_cnt)
    data_source = (extra_info || {})["data_source"] || ARGV[0]

    return <<LUA_CODE_END
-- A word dictionary implementation based on the https://en.wikipedia.org/wiki/Bloom_filter algorithm.
-- Capacity: #{@words_cnt} words (#{storage_eff} bytes per word). Expected false positive rate: 1 in #{fprate}.
-- Automatically generated from #{data_source}.
local export = {}
local bloom_filter_k = #{@k}
local bloom_filter_bitbuf = {
	#{tmp.map {|x| "\"" + x + "\"" }.join(",\n\t")}
}
local bloom_filter_base64dec_lut = {
	#{base64_lut.map {|x| "{ " + x + "}" }.join(",\n\t")}
}
if not mw then
	-- for local testing using something like:
	--   https://github.com/Egor-Skriptunoff/pure_lua_SHA/blob/master/sha2.lua
	--   https://stackoverflow.com/questions/51559181/sha512-pure-lua-5-1-adaptation/51561685#51561685
	local sha2 = require("sha2")
	mw = {["hash"] = {}}
	function mw.hash.hashValue(algo, text) return sha2.sha512(text) end
end

-- Returns true if the word is found in the dictionary and false otherwise
function export.lookup_word(word)
	local h, cnt, bufsize = word, 0, 0
	for _, bitchunk in ipairs(bloom_filter_bitbuf) do
		bufsize = bufsize + string.len(bitchunk) * 6
	end
	while true do
		h = mw.hash.hashValue("sha512", h)
		for i = 1, 128, 8 do
			local idx = tonumber(h:sub(i, i + 8 - 1), 16) % bufsize
			local rem = idx % 6
			local div = (idx - rem) / 6
			for _, bitchunk in ipairs(bloom_filter_bitbuf) do
				if div + 1 <= string.len(bitchunk) then
					local val = string.sub(bitchunk, div + 1, div + 1)
					if not bloom_filter_base64dec_lut[rem + 1][val] then
						return false
					end
					break
				end
				div = div - string.len(bitchunk)
			end
			cnt = cnt + 1
			if cnt >= bloom_filter_k then
				return true
			end
		end
	end
end

function export.query_extra_info()
	return {
		#{extra_info.to_a.map {|x| "[\"" + x[0] + "\"] = " + x[1] }.join(",\n\t\t") }
	}
end

return export
LUA_CODE_END
  end
end

# Command-line driver: reads the word list named by the first argument and
# prints the generated Lua module on standard output.
# ARGV itself is always truthy, so the first element is what must be tested.
abort "Usage: ruby #{$PROGRAM_NAME} <words.txt>\nWhere: words.txt - text file, one word per line\n" unless ARGV[0]

words = {}
extra_info = {}
# File.foreach closes the file when the block finishes.
File.foreach(ARGV[0]) do |l|
  if l =~ /^#\s*(.*?)\s*\=\s*(.*?)\s*$/
    # "# key = value" header line: forwarded to query_extra_info().
    extra_info[$1] = $2
  else
    word = l.strip
    # Blank lines are not words.
    words[word] = true unless word.empty?
  end
end
bf_builder = BloomFilterBuilder.new(words.size, ACCEPTABLE_COLLISION_PROBABILITY)
words.keys.each {|word| bf_builder.insert_word(word) }
puts bf_builder.export_to_lua(extra_info)

If the produced Lua file is larger than the 2 MB limit (as it is for the Russian dictionary), then the Lua file needs to be split into multiple parts, with the bit buffer loaded from separate data modules:

-- Each data module returns one chunk of the base64-encoded bit buffer.
local bloom_filter_bitbuf = {
	require("Module:ru-accentdict/data1"), -- return "miAokDFC+g ... KmJpQKIPk"
	require("Module:ru-accentdict/data2"), -- return "..."
	require("Module:ru-accentdict/data3"), -- return "..."
}

-- A Russian dictionary. Based on the https://en.wikipedia.org/wiki/Bloom_filter algorithm.
-- Capacity: 870005 words (~7.0 bytes per word). Expected false positive rate: 1 in 600 millions.
-- Automatically generated from "enwiktionary-20240401-pages-articles-multistream.xml.bz2"
-- via https://kaikki.org/dictionary/Russian/kaikki.org-dictionary-Russian.json
-- Module table returned at the end of the file.
local export = {}
-- Number of bit probes lookup_word performs before reporting a match.
local bloom_filter_k = 29
-- The filter's bit buffer, split across data submodules. lookup_word treats
-- every entry as a string and walks the chunks in this order.
local bloom_filter_bitbuf = {
	require("Module:User:Ssvb/ru-accentdict/data1"),
	require("Module:User:Ssvb/ru-accentdict/data2"),
	require("Module:User:Ssvb/ru-accentdict/data3"),
}
-- Decoding table for the base64-encoded bit buffer.
-- bloom_filter_base64dec_lut[b + 1][ch] is 1 when bit b (0..5) is set in
-- the 6-bit value that the base64 character ch stands for, and nil
-- otherwise. The rows are computed instead of being spelled out, so they
-- cannot drift away from the alphabet.
local bloom_filter_base64dec_lut = {}
do
	local alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
	for bit = 0, 5 do
		local row = {}
		for val = 0, 63 do
			-- Arithmetic bit test; avoids bitwise operators that are not
			-- available in every Lua version.
			if math.floor(val / 2 ^ bit) % 2 == 1 then
				row[alphabet:sub(val + 1, val + 1)] = 1
			end
		end
		bloom_filter_base64dec_lut[bit + 1] = row
	end
end
-- Outside MediaWiki there is no global `mw`; provide the one function this
-- module needs so that it can be exercised with a stand-alone interpreter.
if not mw then
	-- for local testing using something like:
	--   https://github.com/Egor-Skriptunoff/pure_lua_SHA/blob/master/sha2.lua
	--   https://stackoverflow.com/questions/51559181/sha512-pure-lua-5-1-adaptation/51561685#51561685
	local sha2 = require("sha2")
	mw = { hash = {} }
	-- `algo` is ignored: SHA-512 is the only algorithm this module requests.
	function mw.hash.hashValue(algo, text) return sha2.sha512(text) end
end

--- Tests whether a word is present in the dictionary.
-- The dictionary is a Bloom filter: a word counts as present when all
-- bloom_filter_k probed bits are set, so an absent word can occasionally be
-- reported as present (see the false positive rate in the file header).
-- @param word string in the stored canonical form
-- @return true if the word is found in the dictionary and false otherwise
function export.lookup_word(word)
	local h, cnt, bufsize = word, 0, 0
	-- Total filter size in bits: every buffer character carries 6 bits.
	for _, bitchunk in ipairs(bloom_filter_bitbuf) do
		bufsize = bufsize + string.len(bitchunk) * 6
	end
	while true do
		-- Each round hashes the previous digest again, yielding 16 more
		-- 8-hex-digit bit indices.
		h = mw.hash.hashValue("sha512", h)
		for i = 1, 128, 8 do
			local idx = tonumber(h:sub(i, i + 8 - 1), 16) % bufsize
			local rem = idx % 6         -- bit position inside the character
			local div = (idx - rem) / 6 -- character offset inside the buffer
			-- Find the chunk that holds character `div`.
			for _, bitchunk in ipairs(bloom_filter_bitbuf) do
				if div + 1 <= string.len(bitchunk) then
					local val = string.sub(bitchunk, div + 1, div + 1)
					-- The row is selected by bit position, the entry by
					-- character; a missing entry means the bit is clear.
					if not bloom_filter_base64dec_lut[rem + 1][val] then
						return false
					end
					break
				end
				div = div - string.len(bitchunk)
			end
			cnt = cnt + 1
			if cnt >= bloom_filter_k then
				return true
			end
		end
	end
end

--- Returns metadata recorded when the dictionary was generated.
-- The keys are the names of the "# key = value" header lines written by the
-- word-list script.
-- @return a new table on every call
function export.query_extra_info()
	return {
		["data_source"] = "https://kaikki.org/dictionary/Russian/kaikki.org-dictionary-Russian.json",
		["max_stress_search_steps"] = 7,
		["worst_stress_search_word"] = "благоустра́ивающаяся",
		["max_jo_search_steps"] = 4,
		["worst_jo_search_word"] = "посерьёзневшее",
		["words_with_double_jo"] = { "Бёрёлё́х", "Бёрёлё́ха", "Бёрёлё́хе", "Бёрёлё́хом", "Бёрёлё́ху", "трёхколё́сная", "трёхколё́сного", "трёхколё́сное", "трёхколё́сной", "трёхколё́сном", "трёхколё́сному", "трёхколё́сною", "трёхколё́сную", "трёхколё́сные", "трёхколё́сный", "трёхколё́сным", "трёхколё́сными", "трёхколё́сных", "четырёхзвё́здная", "четырёхзвё́здного", "четырёхзвё́здное", "четырёхзвё́здной", "четырёхзвё́здном", "четырёхзвё́здному", "четырёхзвё́здною", "четырёхзвё́здную", "четырёхзвё́здные", "четырёхзвё́здный", "четырёхзвё́здным", "четырёхзвё́здными", "четырёхзвё́здных" }
	}
end

return export

Module:User:Ssvb/ru-accentdict

Introduction

Building the dictionary

Obtaining the list of Russian words

Converting the list of words into a Lua module

Wikious

Boobota

Sagapedia