-- texdoclib-score.tlu: scoring functions for texdoc
--
-- The TeX Live Team, GPLv3, see texdoclib.tlu for details

-- dependencies
local md5 = require 'md5'
local texdoc = {
    const = require 'texdoclib-const',
    util = require 'texdoclib-util',
    config = require 'texdoclib-config',
}

-- shortcuts
local M = {}

-- shared variables
-- global_adjscore: lowercased pattern -> numeric adjustment
-- spec_adjscore:   lowercased keyword -> (lowercased pattern -> adjustment)
local global_adjscore, spec_adjscore = {}, {}

------------------------- configuration directives -------------------------

-- set key in score table to val, without overriding
--
-- The key is lowercased and val is converted with tonumber(). An entry that
-- already exists is left untouched.
-- Returns true if val is numeric (even when nothing was stored because the
-- key was already present), false otherwise.
local function set_score_table(tab, key, val)
    local k = string.lower(key)
    local v = tonumber(val)
    if v then
        if tab[k] == nil then tab[k] = v end
        return true
    end
    return false
end

-- interpret a confline as a score directive or return false
--
-- Two forms are recognized:
--   adjscore <pattern> = <number>              (stored in global_adjscore)
--   adjscore(<keyword>) <pattern> = <number>   (stored in spec_adjscore)
-- Returns the result of set_score_table() when the line has one of these
-- shapes, and false when it does not.
function M.confline_to_score(line)
    local keyw, pat, val
    -- try global adjscore
    pat, val = string.match(line, '^adjscore%s+([%w%p]+)%s*=%s*([%d+-.]+)')
    if pat and val then
        return set_score_table(global_adjscore, pat, val)
    end
    -- try keyword specific adjscore
    keyw, pat, val = string.match(line,
        '^adjscore%(([%w%p]+)%)%s+([%w%p]+)%s*=%s*([%d+-.]+)')
    if keyw and pat and val then
        keyw = string.lower(keyw)
        spec_adjscore[keyw] = spec_adjscore[keyw] or {}
        return set_score_table(spec_adjscore[keyw], pat, val)
    end
    return false
end

---------------------------- score computation -----------------------------

-- parse filename into base, lang, ext
--
-- ext is whatever texdoc.util.get_ext() returns for the filename; when it is
-- non-empty, the extension and the character preceding it are removed from
-- base. lang is the key of texdoc.const.lang_codes that base ends with after
-- a hyphen ("-<key>"), and that suffix is removed from base too. lang is nil
-- when no such suffix is found.
local function parse(filename)
    local base, lang, ext
    ext = texdoc.util.get_ext(filename)
    if ext ~= nil and ext ~= '' then
        base = filename:sub(1, -#ext - 2)
    else
        base = filename
    end
    for lc, _ in pairs(texdoc.const.lang_codes) do
        local hyph_lc = '-' .. lc
        if base:sub(-#hyph_lc) == hyph_lc then
            return base:sub(1, -#hyph_lc - 1), lc, ext
        end
    end
    return base, lang, ext
end

-- says if pat is a "subword" of str
--
-- pat is searched as plain text. An occurrence counts when it is delimited
-- on both sides: on the left it starts the string, starts with a punctuation
-- character, or is preceded by one; on the right it ends the string, ends
-- with a punctuation character, or is followed by one.
-- Every occurrence is examined, not only the first one, so that e.g.
-- "example" is recognized in "examplep/example.pdf".
local function is_subword(str, pat)
    local function is_delim(i)
        return string.find(string.sub(str, i, i), '%p') ~= nil
    end

    local len = #str
    local init = 1
    repeat
        local i, j = string.find(str, pat, init, true)
        if not i then return false end
        if (i == 1 or is_delim(i) or is_delim(i - 1))
            and (j == len or is_delim(j) or is_delim(j + 1)) then
            return true
        end
        -- retry from the next position; overlapping occurrences are allowed
        init = i + 1
    until init > len -- also guarantees termination for an empty pat
    return false
end

-- says if a filename has a bad basename
--
-- The directory part is stripped first. Each entry of the 'badbasename_list'
-- config value is used as a Lua pattern that must match either the whole
-- basename or the part of it before a dot.
local function has_bad_basename(file)
    file = file:gsub('.*/', '')
    for _, b in ipairs(texdoc.config.get_value('badbasename_list')) do
        if file:find('^' .. b .. '$') or file:find('^' .. b .. '%.') then
            return true
        end
    end
    return false
end

-- compute a pattern score -10 <= s < 10
--
-- name is the (lowercased) file name, pat the (lowercased) search pattern and
-- dbg_score a printf-like debug function.
local function pattern_score(name, pat, dbg_score)
    -- pat is passed as an argument, not spliced into the format string, so
    -- that a '%' in the keyword cannot be taken for a format directive
    dbg_score('Start heuristic scoring with pattern: %s', pat)

    -- score management
    local score = -10
    local function upscore(s, reason, force)
        if s > score or force then
            score = s
            dbg_score('New heuristic score: %.1f. Reason: %s', s, reason)
        end
    end

    -- look for exact or subword match
    if M.is_exact(name, pat) then
        upscore(4, 'exact match')
    elseif is_subword(name, pat) then
        upscore(1, 'subword match')
    end

    -- try derivatives unless pat contains a slash
    local slash = not not string.find(pat, '/', 1, true)
    if not slash then
        for _, suffix in ipairs(texdoc.config.get_value('suffix_list')) do
            local deriv = pat .. suffix
            if M.is_exact(name, deriv) then
                upscore(4.5, 'exact match for derived pattern: ' .. deriv)
            elseif is_subword(name, deriv) then
                upscore(3.5, 'subword match for derived pattern: ' .. deriv)
            end
        end
    end

    -- if extension is bad, score becomes an epsilon
    local ext = texdoc.config.get_value('ext_list')[M.ext_pos(name)]
    if ext and texdoc.config.get_value('badext_list_inv')[ext]
        and score > 0 then
        upscore(0.1, 'bad extension', true)
    end

    -- if basename is bad, score becomes an epsilon
    if has_bad_basename(name) and score > 0 then
        upscore(0.1, 'bad basename', true)
    end

    -- bonus for being in the right directory
    if string.find('/' .. name, '/' .. pat .. '/', 1, true) and not slash then
        upscore(score + 1.5, 'directory bonus')
    end

    -- done
    dbg_score('Final heuristic score: %.1f', score)
    return score
end

-- set the score of a docfile
--
-- Reads df.normname, df.realpath, df.matches, df.tree, df.tlptodoc,
-- df.runtodoc, df.details and df.lang, and stores the result in df.score.
-- original_kw selects the keyword-specific adjustment table.
local function set_score(df, original_kw)
    -- scoring is case-insensitive (patterns are already lowercased)
    local name = string.lower(df.normname)
    local df_id = string.sub(md5.sumhexa(name), 1, 7)

    -- special debugging function
    local function dbg_score(msg, ...)
        -- add the hash id prefix to make the outputs grep-friendly
        texdoc.util.dbg_print('score',
            string.format('(%s) ', df_id) .. msg, ...)
    end

    -- file names go through %s so that a '%' in a path cannot be taken for
    -- a format directive
    dbg_score('Start scoring %s', df.realpath)
    dbg_score('Name used: %s', name)

    -- get score from patterns
    local score = -10
    local is_alias = false

    for _, pat in ipairs(df.matches) do
        local s = -10
        local p = string.lower(pat.name)

        if pat.original then
            -- non-alias
            if df.tree > -1 then
                s = pattern_score(name, p, dbg_score)
            else
                s = 1
            end
        elseif M.is_exact(name, p) then
            -- alias
            is_alias = true
            local bonus, note = 0, ''
            if pat.locale then
                bonus, note = 5, ', (language-based)'
            end
            s = (pat.score or 10) + bonus -- default alias score is 10
            dbg_score('Matching alias "%s", score: %.1f%s', pat.name, s, note)
        end

        if s > score then score = s end
    end
    dbg_score('Max pattern score: %.1f', score)

    -- get score from tlp associations
    if score == -10 and df.tlptodoc then
        score = -1
        dbg_score('New score: %.1f from package name association', score)
    end

    if score == -10 and df.runtodoc then
        score = -5
        dbg_score('New score: %.1f from sty/cls association', score)
    end

    -- bonus for metadata
    if df.details then
        if string.find(string.lower(df.details), 'readme') then
            score = score + 0.1
            dbg_score('Catalogue "readme" bonus: +0.1')
        else
            score = score + 1.5
            dbg_score('Catalogue details bonus: +1.5')
        end
    end

    -- bonus for locale
    local config_lang = texdoc.config.get_value('lang')
    if not is_alias then
        local file_lang

        -- from its catalogue
        if df.lang then
            -- take first two letters; it may have country codes
            file_lang = df.lang:sub(1, 2)
        end

        -- from its filename
        if not file_lang then
            -- only the second result is needed; `_` is declared local here
            -- so that the unused first result does not leak into a global
            local _, lang_suffix = parse(name)
            file_lang = texdoc.const.lang_codes[lang_suffix]
        end

        if config_lang ~= nil and config_lang == file_lang then
            score = score + 1
            dbg_score('Locale match bonus: +1.0')
        elseif file_lang ~= nil and file_lang ~= 'en' then
            -- normally, english documents do not have file_lang,
            -- but sometimes catalogue includes en info (e.g., geometry)
            -- we want to treat both cases similar
            score = score - 0.1
            dbg_score('Locale unmatch: -0.1')
        end
    end

    -- adjust from keyword-specific tables
    if df.tree > -1 and spec_adjscore[original_kw] then
        for pat, val in pairs(spec_adjscore[original_kw]) do
            if val and is_subword('/' .. name, pat) then
                score = score + val
                dbg_score('Adjust by %.1f from specific pattern "%s"',
                    val, pat)
            end
        end
    end

    -- adjust from global tables
    if df.tree > -1 then
        for pat, val in pairs(global_adjscore) do
            if val and is_subword('/' .. name, pat) then
                -- a file still at the minimum score is never raised by a
                -- global adjustment, only lowered
                if score > -10 or val < 0 then score = score + val end
                dbg_score('Adjust by %.1f from global pattern "%s"', val, pat)
            end
        end
    end

    dbg_score('Final score: %.1f', score)

    -- the final score should be a float value
    df.score = score + 0.0
end

-- set the scores for a doclist
local function set_list_scores(list, original_kw)
    for _, df in ipairs(list) do
        set_score(df, original_kw)
    end
end

-- says if filename is an exact match for pat
--
-- Both arguments are split with parse(). A language or a non-empty extension
-- present in the pattern must be identical in the filename. The bases match
-- when they are equal or when the filename's base ends with "/" followed by
-- the pattern's base.
function M.is_exact(filename, pattern)
    local f_base, f_lang, f_ext = parse(filename)
    local p_base, p_lang, p_ext = parse(pattern)

    -- if the pattern contains lang, check if identical
    if p_lang ~= nil and f_lang ~= p_lang then
        return false
    end

    -- if the pattern contains ext, check if identical
    if p_ext ~= nil and p_ext ~= '' and f_ext ~= p_ext then
        return false
    end

    -- finally check the bases
    if (f_base == p_base or
        (f_base:sub(-#p_base) == p_base and
         f_base:sub(-#p_base - 1, -#p_base - 1) == '/')) then
        return true
    else
        return false
    end
end

-- compare two docfile's: (see texdoclib-search.tlu for structure)
-- 1. by score
-- 2. then by extensions (ordered as in ext_list),
-- 3. then lexicographically by normname.
-- 4. then by tree.
-- return true if docfile a should be listed before docfile b
-- (higher score first, then smaller ext_pos, then smaller normname,
-- then larger tree); usable as a table.sort comparator
local function docfile_order(a, b)
    if a.score > b.score then return true end
    if a.score < b.score then return false end

    if a.ext_pos < b.ext_pos then return true end
    if a.ext_pos > b.ext_pos then return false end

    if a.normname < b.normname then return true end
    if a.normname > b.normname then return false end

    return a.tree > b.tree
end

----------------------------- public functions -----------------------------

-- returns the index of the most specific extension of file in ext_list,
-- or config.ext_list_max + 1
--
-- '*' in ext_list is only used while nothing else has been selected, and ''
-- is selected for names that contain no dot at all. Among the entries that
-- the name really ends with, the longest one wins.
function M.ext_pos(filename)
    -- remove zipext if applicable
    local stripped = texdoc.util.parse_zip(filename)

    -- now find the extension
    local best_pos, best_ext
    for idx, cand in ipairs(texdoc.config.get_value('ext_list')) do
        if cand == '*' and best_ext == nil then
            best_pos, best_ext = idx, cand
        elseif cand == '' and not stripped:find('.', 1, true) then
            best_pos, best_ext = idx, cand
        elseif stripped:sub(-cand:len() - 1) == '.' .. cand then
            local better = best_ext == nil
                or best_ext == '*'
                or cand:len() > best_ext:len()
            if better then
                best_pos, best_ext = idx, cand
            end
        end
    end

    if best_pos then
        return best_pos
    end
    return texdoc.config.get_value('ext_list_max') + 1
end

-- return the "quality" of docfile: 'good' for a positive score, 'bad' for a
-- score above -100, 'killed' otherwise
function M.docfile_quality(df)
    if df.score > 0 then
        return 'good'
    end
    if df.score > -100 then
        return 'bad'
    end
    return 'killed'
end

-- sort a doclist: call its stop() method, score every entry for original_kw,
-- then sort the list in place, best entry first
function M.sort_doclist(dl, original_kw)
    dl:stop()
    set_list_scores(dl, original_kw)
    table.sort(dl, docfile_order)
end

return M

-- vim: ft=lua: