-- turkce-sayi.lua -- Turkish suffix harmony for LaTeX-generated numbers. -- Requires LuaTeX 1.10+ (Lua 5.3+). -- License: LPPL 1.3c https://www.latex-project.org/lppl.txt -- Maintainer: Sonat Suer local M = {} ------------------------------------------------------------------------ -- Phonological properties of the last spoken component of each number. -- f: final vowel is front (e/i/ö/ü = true; a/ı/o/u = false) -- r: final vowel is rounded (ö/ü/o/u = true) -- v: word ends in a vowel (triggers buffer/drop rule) -- d: last phoneme is voiced (vowel or voiced consonant) ------------------------------------------------------------------------ local WORD_PROPS = { [0] = {f=false, r=false, v=false, d=true}, -- sıfır (ı, r) [1] = {f=true, r=false, v=false, d=true}, -- bir (i, r) [2] = {f=true, r=false, v=true, d=true}, -- iki (ends in i) [3] = {f=true, r=true, v=false, d=false}, -- üç (ü, ç voiceless) [4] = {f=true, r=true, v=false, d=false}, -- dört (ö, t voiceless) [5] = {f=true, r=false, v=false, d=false}, -- beş (e, ş voiceless) [6] = {f=false, r=false, v=true, d=true}, -- altı (ends in ı) [7] = {f=true, r=false, v=true, d=true}, -- yedi (ends in i) [8] = {f=true, r=false, v=false, d=true}, -- sekiz (i, z) [9] = {f=false, r=true, v=false, d=true}, -- dokuz (u, z) [10] = {f=false, r=true, v=false, d=true}, -- on (o, n) [20] = {f=true, r=false, v=true, d=true}, -- yirmi (ends in i) [30] = {f=false, r=true, v=false, d=true}, -- otuz (u, z) [40] = {f=false, r=false, v=false, d=false}, -- kırk (ı, k voiceless) [50] = {f=true, r=false, v=true, d=true}, -- elli (ends in i) [60] = {f=false, r=false, v=false, d=false}, -- altmış (ı, ş voiceless) [70] = {f=true, r=false, v=false, d=false}, -- yetmiş (i, ş voiceless) [80] = {f=true, r=false, v=false, d=true}, -- seksen (e, n) [90] = {f=false, r=false, v=false, d=true}, -- doksan (a, n) [100] = {f=true, r=true, v=false, d=true}, -- yüz (ü, z) [1000] = {f=true, r=false, v=false, d=true}, -- bin (i, n) [1000000] = {f=false, r=true, v=false, d=true}, -- milyon (o, n) [1000000000] = {f=false, r=false, v=false, d=true}, -- milyar (a, r) [1000000000000] = {f=false, r=true, v=false, d=true}, -- trilyon(o, n) } local MAGNITUDES = { 1000000000000, 1000000000, 1000000, 1000, 100, 90, 80, 70, 60, 50, 40, 30, 20, 10, } local function last_word(n) n = math.floor(math.abs(n) + 0.5) if n == 0 then return WORD_PROPS[0] end local props = WORD_PROPS[0] for _, mag in ipairs(MAGNITUDES) do if math.floor(n / mag) > 0 then props = WORD_PROPS[mag] n = n % mag end end if n > 0 then props = WORD_PROPS[n] end return props end ------------------------------------------------------------------------ -- Character classification (Unicode codepoints) ------------------------------------------------------------------------ -- Lowercase Turkish specials: ı=0x0131 ü=0x00FC ö=0x00F6 ç=0x00E7 ş=0x015F ğ=0x011F -- Uppercase Turkish specials: İ=0x0130 Ü=0x00DC Ö=0x00D6 Ç=0x00C7 Ş=0x015E Ğ=0x011E -- Turkish case rule: i(0x69) ↔ İ(0x0130), ı(0x0131) ↔ I(0x0049) local UPPER_TO_LOWER = { [0x41]=0x61, [0x42]=0x62, [0x43]=0x63, [0x44]=0x64, [0x45]=0x65, [0x46]=0x66, [0x47]=0x67, [0x48]=0x68, [0x49]=0x0131, -- I → ı (Turkish: capital I is dotless) [0x4A]=0x6A, [0x4B]=0x6B, [0x4C]=0x6C, [0x4D]=0x6D, [0x4E]=0x6E, [0x4F]=0x6F, [0x50]=0x70, [0x52]=0x72, [0x53]=0x73, [0x54]=0x74, [0x55]=0x75, [0x56]=0x76, [0x59]=0x79, [0x5A]=0x7A, [0x00C7]=0x00E7, -- Ç → ç [0x00D6]=0x00F6, -- Ö → ö [0x00DC]=0x00FC, -- Ü → ü [0x011E]=0x011F, -- Ğ → ğ [0x015E]=0x015F, -- Ş → ş [0x0130]=0x69, -- İ → i (Turkish: capital dotted İ) } local LOWER_TO_UPPER = { [0x61]=0x41, [0x62]=0x42, [0x63]=0x43, [0x64]=0x44, [0x65]=0x45, [0x66]=0x46, [0x67]=0x47, [0x68]=0x48, [0x69]=0x0130, -- i → İ (Turkish!) [0x6A]=0x4A, [0x6B]=0x4B, [0x6C]=0x4C, [0x6D]=0x4D, [0x6E]=0x4E, [0x6F]=0x4F, [0x70]=0x50, [0x72]=0x52, [0x73]=0x53, [0x74]=0x54, [0x75]=0x55, [0x76]=0x56, [0x79]=0x59, [0x7A]=0x5A, [0x00E7]=0x00C7, -- ç → Ç [0x00F6]=0x00D6, -- ö → Ö [0x00FC]=0x00DC, -- ü → Ü [0x011F]=0x011E, -- ğ → Ğ [0x015F]=0x015E, -- ş → Ş [0x0131]=0x49, -- ı → I (Turkish!) } local function is_letter(cp) return UPPER_TO_LOWER[cp] ~= nil or (cp >= 0x61 and cp <= 0x7A) -- a-z or cp == 0x0131 or cp == 0x00FC or cp == 0x00F6 or cp == 0x00E7 or cp == 0x015F or cp == 0x011F end -- Return the utf8 string for cp_out, uppercased when was_upper is true. local function apply_case(cp_out, was_upper) if not was_upper then return utf8.char(cp_out) end return utf8.char(LOWER_TO_UPPER[cp_out] or cp_out) end local HARMONY_V = { -- vowels that transform under harmony [0x61]=true, [0x65]=true, [0x69]=true, [0x75]=true, -- a e i u [0x0131]=true, [0x00FC]=true, -- ı ü } local INVARIANT_V = { [0x6F]=true, [0x00F6]=true } -- o ö (pass through) local function is_vowel(cp) return HARMONY_V[cp] or INVARIANT_V[cp] end local VOICED_C = { [0x62]=true, [0x63]=true, [0x64]=true, [0x67]=true, [0x011F]=true, -- ğ [0x6A]=true, [0x6C]=true, [0x6D]=true, [0x6E]=true, [0x72]=true, [0x76]=true, [0x79]=true, [0x7A]=true, } local function is_voiced(cp) return is_vowel(cp) or (VOICED_C[cp] == true) end -- 4-way harmony vowel: returns the codepoint. local function vowel4cp(f, r) if f and not r then return 0x69 -- i elseif f and r then return 0x00FC -- ü elseif not f and not r then return 0x0131 -- ı else return 0x75 -- u end end ------------------------------------------------------------------------ -- Invariant morphemes (Büyük Ünlü Uyumuna Uymayan Ekler), -- longest first to prevent partial prefix matches. -- f, r: harmony class of the morpheme's last vowel (update running state). -- voiced: voicing of the morpheme's last character. ------------------------------------------------------------------------ local INVARIANT = { -- ı = \xC4\xB1 {str="mt\xC4\xB1rak", f=false, r=false, voiced=false}, -- mtırak (a, last=k) {str="leyin", f=true, r=false, voiced=true}, -- leyin (i, last=n) {str="gil", f=true, r=false, voiced=true}, -- gil (i, last=l) {str="ken", f=true, r=false, voiced=true}, -- ken (e, last=n) {str="yor", f=false, r=true, voiced=true}, -- yor (o, last=r) {str="ki", f=true, r=false, voiced=true}, -- ki (i, last=i) } -- Pre-compute char counts and codepoint arrays for invariant morphemes. for _, m in ipairs(INVARIANT) do local cnt, cps = 0, {} for _, cp in utf8.codes(m.str) do cnt = cnt + 1; cps[cnt] = cp end m.char_count = cnt m.cps = cps -- lowercase codepoints for case-insensitive matching end -- Case-insensitive check: does chars[idx..] start with morpheme m? local function matches_morpheme(chars, idx, m) if idx + m.char_count - 1 > #chars then return false end for j = 1, m.char_count do if chars[idx + j - 1].lower ~= m.cps[j] then return false end end return true end ------------------------------------------------------------------------ -- harmonize(suffix, props) -- Returns prefix (leading non-letter punctuation), algo (transformed letters). -- The caller concatenates them; splitting allows override lookup on algo alone. ------------------------------------------------------------------------ local function harmonize(suffix, props) local f = props.f local r = props.r local last_voiced = props.v or props.d -- Step 0: extract leading non-letter prefix (e.g. apostrophe). local prefix = "" local lsuffix = "" local found = false for p, cp in utf8.codes(suffix) do if is_letter(cp) then lsuffix = suffix:sub(p) found = true break end prefix = prefix .. utf8.char(cp) end if not found then return prefix, "" end -- Build codepoint list with case info: {cp, lower, upper} local chars = {} for _, cp in utf8.codes(lsuffix) do local lo = UPPER_TO_LOWER[cp] chars[#chars + 1] = {cp=cp, lower=lo or cp, upper=lo ~= nil} end local nch = #chars if nch == 0 then return prefix, "" end local result = "" local idx = 1 -- ── Step 1: kaynaştırma harfi / ünlü düşmesi ───────────────────── -- Applies only when stem ends in a vowel and suffix starts with one. if props.v and HARMONY_V[chars[1].lower] then if nch == 1 then -- Case A: single-vowel suffix → y buffer (accusative default) result = result .. "y" elseif nch == 2 and chars[2].lower == 0x6E then -- Case B: V + n → n buffer (genitive) result = result .. "n" else -- Case C: all other vowel-initial suffixes → drop initial vowel idx = 2 end end -- ── Step 2: character-by-character scan ────────────────────────── while idx <= nch do local ch = chars[idx] local cp = ch.lower -- lowercase codepoint drives all logic local up = ch.upper -- true if original character was uppercase -- Check for invariant morpheme (case-insensitive). local hit = false for _, m in ipairs(INVARIANT) do if matches_morpheme(chars, idx, m) then -- Output original characters to preserve case. for j = 0, m.char_count - 1 do result = result .. utf8.char(chars[idx + j].cp) end f = m.f r = m.r last_voiced = m.voiced idx = idx + m.char_count hit = true break end end if hit then goto continue end -- daş/taş: leading consonant alternates, vowel 'a' is invariant. -- ş = U+015F if idx + 2 <= nch then local c1 = chars[idx].lower local c2 = chars[idx + 1].lower local c3 = chars[idx + 2].lower if (c1 == 0x64 or c1 == 0x74) and c2 == 0x61 and c3 == 0x015F then result = result .. apply_case(last_voiced and 0x64 or 0x74, chars[idx].upper) .. apply_case(0x61, chars[idx + 1].upper) .. apply_case(0x015F, chars[idx + 2].upper) f = false -- 'a' is back r = false -- 'a' is unrounded last_voiced = false -- ş is voiceless idx = idx + 3 goto continue end end if cp == 0x64 or cp == 0x74 then -- d / t: hardening or softening if last_voiced then result = result .. apply_case(0x64, up); last_voiced = true else result = result .. apply_case(0x74, up); last_voiced = false end elseif cp == 0x63 or cp == 0x00E7 then -- c / ç if last_voiced then result = result .. apply_case(0x63, up); last_voiced = true else result = result .. apply_case(0x00E7, up); last_voiced = false end elseif cp == 0x65 or cp == 0x61 then -- e / a: 2-way harmony result = result .. apply_case(f and 0x65 or 0x61, up); last_voiced = true elseif HARMONY_V[cp] then -- i ı ü u: 4-way harmony result = result .. apply_case(vowel4cp(f, r), up); last_voiced = true elseif INVARIANT_V[cp] then -- o / ö: pass through, preserve case result = result .. apply_case(cp, up); last_voiced = true else -- all other characters: verbatim result = result .. utf8.char(ch.cp) last_voiced = is_voiced(cp) end idx = idx + 1 ::continue:: end return prefix, result end ------------------------------------------------------------------------ -- Public API ------------------------------------------------------------------------ -- overrides[n][algo] = replacement -- algo is the letter-only suffix the algorithm produces (e.g. "te", "üncü", "da"). local overrides = {} function M.set_override(n, algo_output, replacement) local k = math.floor(tonumber(n) or 0) if not overrides[k] then overrides[k] = {} end overrides[k][algo_output] = replacement end function M.remove_override(n, algo_output) local k = math.floor(tonumber(n) or 0) if overrides[k] then overrides[k][algo_output] = nil end end M.debug = false function M.output_suffix(n_str, suffix) -- For multi-level refs like "2.3" use the last component (3 = üç, not 2 = iki). -- tostring() preserves the full string when n_str is already a string (e.g. "1.10"). local last = tostring(n_str):match("[^.]+$") or tostring(n_str) local n = math.floor(tonumber(last) or 0) if M.debug then texio.write_nl("turkce-sayi: n=" .. n .. " suffix=[" .. suffix .. "]") end local props = last_word(n) local prefix, algo = harmonize(suffix, props) if overrides[n] then local replacement = overrides[n][algo] if replacement then algo = replacement end end if M.debug then texio.write_nl("turkce-sayi: result=[" .. prefix .. algo .. "]") end tex.sprint(prefix .. algo) end -- bib_output_suffix(keys, suffix) -- keys: comma-separated cite key list (e.g. "smith,jones") -- Looks up the citation number of the LAST key via token.get_macro("b@") -- and outputs the transformed suffix. Requires two LaTeX passes (the cite -- number is written to .aux on pass 1 and read back on pass 2). function M.bib_output_suffix(keys, suffix) local last_key = keys:match("[^,]+$") if last_key then last_key = last_key:match("^%s*(.-)%s*$") end local n = 0 if last_key and last_key ~= "" then local val = token.get_macro("b@" .. last_key) n = tonumber(val) or 0 end M.output_suffix(n, suffix) end return M