Module:Internet Archive: Difference between revisions
From Thetacola Wiki
Jump to navigationJump to search
en>Amorymeltzer m (Changed protection level for "Module:Internet Archive": High-risk Lua module: Bump to TE-protection to match Template:Internet Archive author, which has ~ 17k transclusions ([Edit=Require template editor access] (indefinite) [Move=Require template editor access] (indefinite))) |
m (1 revision imported) |
(No difference)
|
Latest revision as of 00:22, 15 August 2022
File:Full-protection-shackle.svg | This module is subject to page protection. It is a highly visible module in use by a very large number of pages, or is substituted very frequently. Because vandalism or mistakes would affect many pages, and even trivial editing might cause substantial load on the servers, it is protected from editing. |
Usage[edit source]
There is currently 1 template that invokes this module, {{Internet Archive author}}
.
If future Lua scripts for Internet Archive are created (books, film, audio, etc), this Module would be a natural location to build.
--[[ For functions related to Internet Archive Notes: 1. Internet Archive runs Elasticsearch search engine as of 4 Nov 2015 2. Program flowchart: Break name down into number of words Build a base URL based on number of words (1,2,3,4,5+), use of sopt=t switch, and availability of birth-death dates If any words contain extended-ascii characters append extra code for wildcards based on sopt=t or w return finished URL 3. URL length should not exceed 2000 characters or it will break certain popular browsers 4. Wildcard (*) replacements should be avoided in the first letter of the first word, and with any single-letter words 5. Changing search formulations will have impacts on existing uses of the template and off-line tools which are optimized for these search recipes. ]] local p = {} --[[ For Template:Internet Archive author ]] function p.author(frame) local pframe = frame:getParent() local args = pframe.args local tname = "Internet Archive author" -- name of calling template. Change if template rename. local name = nil -- article name (default: current page name) dname = nil -- display name (default: current page name) local sname = nil -- search name (default: current page name) local sopt = nil -- search options (default: nil) byabout = "Works by or about" tagline = "at [[Internet Archive]]" urlhead = "https://archive.org/search.php?query=" mydate = "" -- birth-death date --- Determine name name = trimArg(args.name) -- When using template outside main article space, the 'name' parameter is required (not optional) if not name then name = mw.title.getCurrentTitle().text end dname = mw.ustring.gsub(name,'%s+%([^%(]-%)$', '') -- Remove the final disambig parentheses sname = dname if trimArg(args.sname) then sname = trimArg(args.sname) end if trimArg(args.dname) then dname = trimArg(args.dname) end --- Determine search option sopt = trimArg(args.sopt) if sopt then sopt = mw.ustring.lower(sopt) if sopt == "tight" then sopt = "t" end if sopt == "tightx" then sopt = "tx" end if sopt == "wild" then sopt = "w" end if sopt ~= "t" and sopt ~= "tx" and sopt ~= "w" then sopt = "unknown" end end --- Determine tagline if trimArg(args.coda) then tagline = tagline .. " " .. trimArg(args.coda) end --- Custom search. Do early to avoid unnecessary processing. if trimArg(args.search) then local search = p.ia_url_encode(trimArg(args.search)) return "[" .. urlhead .. search .. " " .. byabout .. " " .. dname .. "] " .. tagline end -- Determine media string media = p.mediaTypes(args.media) if media == "" then mediaopen = "%28" -- added a default mediatype Dec 2015 see p.mediaTypes() else mediaopen = "%28" end -- Determine date of birth and death local temp = mw.text.split(p.bdDate(args.birth, args.death, name), " ") local birth = temp[1] local death = temp[2] if birth == "Error" or death == "Error" then return "Error in [[:Template:"..tname.."]]: [[" ..name.. "]] doesn't exist." end --- Split sname into words and count words local N = mw.text.split(sname, " ") local l, count = mw.ustring.gsub(sname, "%S+", "") if count == 0 then return "Error in [[:Template:"..tname.."]]: Zero-word name." end --- Date string if birth ~= "none" and death ~= "none" then if p.ia_extendedascii(N[count]) == 1 then mydate = "%20OR%20%28%22"..birth.."-"..death.."%22%20AND%20%28%22"..p.urlX(N[count]).."%22%20OR%20"..p.urlX(p.ia_deaccent(N[count])).."%29%29" else mydate = "%20OR%20%28%22"..birth.."-"..death.."%22%20AND%20"..p.urlX(N[count]).."%29" end end --- wild string wild = "%29" if sopt == "w" and p.ia_extendedascii(sname) == 1 then if p.wildcheck(N, count) == 1 then myurl = p.wildfix(N, count) return p.IArender() end if count < 3 or count > 3 then -- (first last) wild = "%20OR%20%28" .. p.ia_url_encode(p.ia_extendedascii2wildcard(sname)) .. "%29%29" end if count == 3 then -- (first last) wild = "%20OR%20%28" .. p.ia_url_encode(p.ia_extendedascii2wildcard(N[1])) .. "%20" .. p.ia_url_encode(p.ia_extendedascii2wildcard(N[3])) .. "%29%29" end end --[[ Format URL ]] if count == 1 then myurl = p.oneWord(sname) if sopt == "t" and p.ia_extendedascii(sname) == 1 then local plainname = p.ia_deaccent(sname) local A1 = "%20OR%20%22"..p.urlX(plainname) myurl = myurl .. A1 .. "%22" return p.IArender() end return p.IArender() end if count == 2 then myurl = p.twoWords(N, sopt) if sopt == "t" and p.ia_extendedascii(sname) == 1 then local plainname = p.ia_deaccent(sname) local PN = mw.text.split(plainname, " ") -- Last, First local A1 = "%20OR%20%22"..p.urlX(PN[2]).."%2C%20"..p.urlX(PN[1]) -- First Last local A2 = "%22%20OR%20%22"..p.urlX(PN[1]).."%20"..p.urlX(PN[2]) myurl = myurl .. A1 .. A2 .. "%22" return p.IArender() end return p.IArender() end if count == 3 then myurl = p.threeWords(N, sopt) if sopt == "t" and p.ia_extendedascii(sname) == 1 then local plainname = p.ia_deaccent(sname) local PN = mw.text.split(plainname, " ") local FIRST = p.urlX(PN[1]) local MIDDLE = p.urlX(PN[2]) local LAST = p.urlX(PN[3]) local firstinitialp = p.urlX( p.firstLetter(PN[1]) ) local middleinitialp = p.urlX( p.firstLetter(PN[2]) ) -- First Middle Last local A1 = "%20OR%20%22"..FIRST.."%20"..MIDDLE.."%20"..LAST -- Last, First Middle local A2 = "%22%20OR%20%22"..LAST.."%2C%20"..FIRST.."%20"..MIDDLE -- Last, First M. local A3 = "%22%20OR%20%22"..LAST.."%2C%20"..FIRST.."%20"..middleinitialp.."%2E" -- Last, F. M. local A4 = "%22%20OR%20%22"..LAST.."%2C%20"..firstinitialp..".%20"..middleinitialp.."%2E" local ALL = A1 .. A2 .. A3 .. A4 .. "%22" myurl = myurl .. ALL return p.IArender() end return p.IArender() end if count == 4 then myurl = p.fourWords(N, sopt) if sopt == "t" and p.ia_extendedascii(sname) == 1 then local plainname = p.ia_deaccent(sname) local PN = mw.text.split(plainname, " ") local FIRST = p.urlX(PN[1]) local SECOND = p.urlX(PN[2]) local THIRD = p.urlX(PN[3]) local LAST = p.urlX(PN[4]) local firstinitialp = p.urlX( p.firstLetter(PN[1]) ) local secondinitialp = p.urlX( p.firstLetter(PN[2]) ) local thirdinitialp = p.urlX( p.firstLetter(PN[3]) ) -- Last, First Second Third local A1 = "%20OR%20%22"..LAST.."%2C%20"..FIRST.."%20"..SECOND.."%20"..THIRD -- First Second Third Last local A2 = "%22%20OR%20%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST -- Last, F. S. T. local A3 = "%22%20OR%20%22"..LAST.."%2C%20"..firstinitialp.."%2E%20"..secondinitialp.."%2E%20"..thirdinitialp.."%2E" local ALL = A1 .. A2 .. A3 .. "%22" myurl = myurl .. ALL return p.IArender() end return p.IArender() end if count > 4 then myurl = "" if sopt == "w" and p.ia_extendedascii(sname) == 1 then myurl = "%28" end myurl = myurl .. "%28" .. p.ia_url_encode(sname) if sopt == "w" and p.ia_extendedascii(sname) == 1 then myurl = myurl .. "%29" end if sopt == "t" and p.ia_extendedascii(sname) == 1 then local plainname = p.ia_deaccent(sname) local A1 = "%29%20OR%20%28"..p.ia_url_encode(plainname) myurl = myurl .. A1 return p.IArender() end return p.IArender() end return "Unknown error (1). Please check documentation for [[Template:"..tname.."]]" end -- Build final output and render function p.IArender() return "[" .. urlhead .. mediaopen .. myurl .. wild .. mydate .. media .. " " .. byabout .. " " .. dname .. "] " .. tagline end function p.oneWord(sname) local nameurl = p.ia_url_encode(sname) local A1 = "%28subject%3A%22"..nameurl local A2 = "%22%20OR%20creator%3A%22"..nameurl local A3 = "%22%20OR%20description%3A%22"..nameurl local A4 = "%22%20OR%20title%3A%22"..nameurl return A1 .. A2 .. A3 .. A4 .. "%22" end function p.twoWords(N, sopt) local FIRST = p.urlX(N[1]) local LAST = p.urlX(N[2]) local firstinitial = p.urlX( p.firstLetter(N[1]) ) -- Last, First local S1 = "%28subject%3A%22"..LAST.."%2C%20"..FIRST -- First Last local S2 = "%22%20OR%20subject%3A%22"..FIRST.."%20"..LAST local SALL = S1..S2 -- Last, First local C1 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..FIRST -- First Last local C2 = "%22%20OR%20creator%3A%22"..FIRST.."%20"..LAST local CALL = C1..C2 -- First Last local T1 = "%22%20OR%20title%3A%22"..FIRST.."%20"..LAST local TALL = T1 -- Last, First local D1 = "%22%20OR%20description%3A%22"..LAST.."%2C%20"..FIRST -- First Last local D2 = "%22%20OR%20description%3A%22"..FIRST.."%20"..LAST local DALL = D1..D2 if sopt == "t" or sopt == "tx" then return SALL .. CALL .. TALL .. DALL .. "%22" else -- Last, F. local C3 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..firstinitial.."%2E" local CALL = CALL..C3 return SALL .. CALL .. TALL .. DALL .. "%22" end end function p.threeWords(N, sopt) -- CAUTION: The following is near the max 2000 character URL limit for most browsers when using long names -- such as "René-Nicolas Dufriche Desgenettes". local FIRST = p.urlX(N[1]) local MIDDLE = p.urlX(N[2]) local LAST = p.urlX(N[3]) local firstinitial = p.urlX( p.firstLetter(N[1]) ) local middleinitial = p.urlX( p.firstLetter(N[2]) ) -- Last, First Middle local S1 = "%28subject%3A%22"..LAST.."%2C%20"..FIRST.."%20"..MIDDLE -- Last, First M. local S2 = "%22%20OR%20subject%3A%22"..LAST.."%2C%20"..FIRST.."%20"..middleinitial.."%2E" -- Last, F. M. local S3 = "%22%20OR%20subject%3A%22"..LAST.."%2C%20"..firstinitial.."%2E%20"..middleinitial.."%2E" -- First Middle Last local S4 = "%22%20OR%20subject%3A%22"..FIRST.."%20"..MIDDLE.."%20"..LAST -- First M. Last local S5 = "%22%20OR%20subject%3A%22"..FIRST.."%20"..middleinitial.."%2E%20"..LAST -- F. M. Last local S6 = "%22%20OR%20subject%3A%22"..firstinitial.."%2E%20"..middleinitial.."%2E%20"..LAST local SALL = S1..S2..S3..S4..S5..S6 -- First Middle Last local C1 = "%22%20OR%20creator%3A%22"..FIRST.."%20"..MIDDLE.."%20"..LAST -- First M. Last local C2 = "%22%20OR%20creator%3A%22"..FIRST.."%20"..middleinitial.."%2E%20"..LAST -- F. M. Last local C3 = "%22%20OR%20creator%3A%22"..firstinitial.."%2E%20"..middleinitial.."%2E%20"..LAST -- F. Middle Last local C4 = "%22%20OR%20creator%3A%22"..firstinitial.."%2E%20"..MIDDLE.."%20"..LAST -- Last, First Middle local C5 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..FIRST.."%20"..MIDDLE -- Last, First M. local C6 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..FIRST.."%20"..middleinitial.."%2E" -- Last, F. M. local C7 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..firstinitial.."%2E%20"..middleinitial.."%2E" -- Last, F. M. local C8 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..firstinitial.."%2E%20"..MIDDLE local CALL = C1..C2..C3..C4..C5..C6..C7..C8 -- First Middle Last local T1 = "%22%20OR%20title%3A%22"..FIRST.."%20"..MIDDLE.."%20"..LAST -- First M. Last local T2 = "%22%20OR%20title%3A%22"..FIRST.."%20"..middleinitial.."%2E%20"..LAST -- F. M. Last local T3 = "%22%20OR%20title%3A%22"..firstinitial.."%2E%20"..middleinitial.."%2E%20"..LAST local TALL = T1..T2..T3 -- First Middle Last local D1 = "%22%20OR%20description%3A%22"..FIRST.."%20"..MIDDLE.."%20"..LAST -- First M. Last local D2 = "%22%20OR%20description%3A%22"..FIRST.."%20"..middleinitial.."%2E%20"..LAST -- F. M. Last local D3 = "%22%20OR%20description%3A%22"..firstinitial.."%2E%20"..middleinitial.."%2E%20"..LAST -- Last, First Middle local D4 = "%22%20OR%20description%3A%22"..LAST.."%2C%20"..FIRST.."%20"..MIDDLE -- Last, First M. local D5 = "%22%20OR%20description%3A%22"..LAST.."%2C%20"..FIRST.."%20"..middleinitial.."%2E" local DALL = D1..D2..D3..D4..D5 if sopt == "t" or sopt == "tx" then return SALL .. CALL .. TALL .. DALL .. "%22" else -- Last, First local S7 = "%22%20OR%20subject%3A%22"..LAST.."%2C%20"..FIRST -- First Last local S8 = "%22%20OR%20subject%3A%22"..FIRST.."%20"..LAST local SALL = SALL..S7..S8 -- First Last local C9 = "%22%20OR%20creator%3A%22"..FIRST.."%20"..LAST -- Last, First local C10 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..FIRST local CALL = CALL..C9..C10 -- First Last local T4 = "%22%20OR%20title%3A%22"..FIRST.."%20"..LAST local TALL = TALL..T4 -- First Last local D6 = "%22%20OR%20description%3A%22"..FIRST.."%20"..LAST -- Last, First local D7 = "%22%20OR%20description%3A%22"..LAST.."%2C%20"..FIRST local DALL = DALL..D6..D7 return SALL .. CALL .. TALL .. DALL .. "%22" end end function p.fourWords(N, sopt) local FIRST = p.urlX(N[1]) local SECOND = p.urlX(N[2]) local THIRD = p.urlX(N[3]) local LAST = p.urlX(N[4]) local firstinitial = p.firstLetter(N[1]) local secondinitial = p.firstLetter(N[2]) local thirdinitial = p.firstLetter(N[3]) if sopt == "t" or sopt == "tx" then -- Last, First Second Third local S1 = "%28subject%3A%22"..LAST.."%2C%20"..FIRST.."%20"..SECOND.."%20"..THIRD -- First Second Third Last local S2 = "%22%20OR%20subject%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST -- Last, First Second Third local C1 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..FIRST.."%20"..SECOND.."%20"..THIRD -- First Second Third Last local C2 = "%22%20OR%20creator%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST -- First Second Third Last local T1 = "%22%20OR%20title%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST -- First Second Third Last local D1 = "%22%20OR%20description%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST return S1..S2..C1..C2..T1..D1.."%22" end -- Last, First Second Third local S1 = "%28subject%3A%22"..LAST.."%2C%20"..FIRST.."%20"..SECOND.."%20"..THIRD -- First Second Third Last local S2 = "%22%20OR%20subject%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST -- Last, First Second Third local C1 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..FIRST.."%20"..SECOND.."%20"..THIRD -- First Second Third Last local C2 = "%22%20OR%20creator%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST -- Last, F. S. T. local C3 = "%22%20OR%20creator%3A%22"..LAST.."%2C%20"..firstinitial.."%2E%20"..secondinitial.."%2E%20"..thirdinitial.."%2E" -- First Second Third Last local T1 = "%22%20OR%20title%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST -- First Second Third Last local D1 = "%22%20OR%20description%3A%22"..FIRST.."%20"..SECOND.."%20"..THIRD.."%20"..LAST return S1..S2..C1..C2..C3..T1..D1.."%22" end -- ElasticSearch speed/resource problems if first letter of first word is "*" wildcard ie. accented letter -- Build special search in these cases. -- https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#_wildcards function p.wildfix(N, count) --- Split along "-" and use only first word ie. John-Taylor-Smith becomes John local NF = mw.text.split(N[1], "-") local NL = mw.text.split(N[count], "-") -- ..but use full name for 1-word names if count == 1 then NF[1] = N[1] NL[1] = N[1] end -- ((Fïrst OR First) AND (Lást OR Last)) return "%28%28%22" .. NF[1] .. "%22%20OR%20" .. p.ia_deaccent(NF[1]) .. "%29%20AND%20%28%22" .. NL[1] .. "%22%20OR%20" .. p.ia_deaccent(NL[1]) .. "%29" end -- Return 1 if the first letter of first word, or any single-letter word, is extended ascii function p.wildcheck(N, count) local i = 0 -- first letter of first word is extended ascii if N[1]:byte(1) < 32 or N[1]:byte(1) > 126 then return 1 end -- any single-letter word that is composed of only extended ascii while i < count do i = i + 1 if N[i]:len() == 1 then if N[i]:byte(1) < 32 or N[i]:byte(1) > 126 then return 1 end end end return 0 end function trimArg(arg) if arg == "" or arg == nil then return nil else return mw.text.trim(arg) end end function p.mediaTypes(argsmedia) -- Added a default mediatype Dec 2015 due to too many false positives in the software mediatype, caused by birth-death dates catching numbers in source codes local media = "-mediatype:software" if argsmedia ~="" and argsmedia ~=nil then local medialist = mw.text.split(mw.text.trim(argsmedia), " ") local al, acount = mw.ustring.gsub(mw.text.trim(argsmedia), "%S+", "") local i = 0 repeat -- the following could be condensed but repetitive for clarity i = i + 1 if(mw.ustring.lower(medialist[i]) == "text" or mw.ustring.lower(medialist[i]) == "texts") then media = media .. p.ia_url_encode(" OR mediatype:texts") end if(mw.ustring.lower(medialist[i]) == "audio") then media = media .. p.ia_url_encode(" OR mediatype:audio") end if(mw.ustring.lower(medialist[i]) == "video") then media = media .. p.ia_url_encode(" OR mediatype:video") end until i == acount end media = "%29%20AND%20%28" .. media .. "%29" return media end -- Alt way to get b/d dates via getContent() function p.bdDateAlt(argsbirth, argsdeath, name) local pagetext = nil local birth = "none" local death = "none" -- Load the page local t = mw.title.new(name) if(t.exists) then pagetext = t:getContent() end if pagetext == nil then return "Error" end -- Remove false positives pagetext = mw.ustring.gsub( mw.ustring.gsub(pagetext, "<!--.--->", ""), "<nowiki>.-</nowiki>", "") -- "Category:1900 births" if argsbirth == "" or argsbirth == nil then local birthcheck = mw.ustring.match(pagetext, "%[%[%s-[Cc]ategory:%s-%d+%.?%d*%s-births%s-%]%]" ) if birthcheck ~= nil then birth = mw.ustring.match(birthcheck, "%d+%.?%d*") else birth = "none" end else birth = mw.text.trim(argsbirth) end -- "Category:2000 deaths" if argsdeath == "" or argsdeath == nil then local deathcheck = mw.ustring.match(pagetext, "%[%[%s-[Cc]ategory:%s-%d+%.?%d*%s-deaths%s-%]%]" ) if deathcheck ~= nil then death = mw.ustring.match(deathcheck, "%d+%.?%d*") else death = "none" end else death = mw.text.trim(argsdeath) end return birth .. " " .. death end -- Get b/d dates via Wikidata. -- function p.bdDate(argsbirth, argsdeath, name) local pagetext = nil local birth = "none" local death = "none" entity = mw.wikibase.getEntityObject() if not entity or not entity.claims then -- Alternative if template not on a page in mainspace. This is needed since Wikidata can only be retrieved -- for the article where the template is located. return p.bdDateAlt(argsbirth, argsdeath, name) end -- Note: The below uses formatPropertyValues() to get and format the date from Wikidata. -- For an alternative method, see sandbox revision dated 5:58 am, 15 October 2014 if argsbirth == "" or argsbirth == nil then local birthtable = entity:formatPropertyValues( 'P569' ) local birthsplit = mw.text.split(birthtable["value"], " ") local l, count = mw.ustring.gsub(birthtable["value"], "%S+", "") if count > 0 then if string.find(birthsplit[count], "^%d") then birth = birthsplit[count] elseif string.find(birthsplit[count], "BCE") then birth = birthsplit[count - 1] elseif string.find(birthsplit[count], "BC") then birth = birthsplit[count - 1] elseif string.find(birthsplit[count], "AD") then birth = birthsplit[count - 1] end end else birth = mw.text.trim(argsbirth) end if argsdeath == "" or argsdeath == nil then local deathtable = entity:formatPropertyValues( 'P570' ) local deathsplit = mw.text.split(deathtable["value"], " ") local l, count = mw.ustring.gsub(deathtable["value"], "%S+", "") if count > 0 then if string.find(deathsplit[count], "^%d") then death = deathsplit[count] elseif string.find(deathsplit[count], "BCE") then death = deathsplit[count - 1] elseif string.find(deathsplit[count], "BC") then death = deathsplit[count - 1] elseif string.find(deathsplit[count], "AD") then death = deathsplit[count - 1] end end else death = mw.text.trim(argsdeath) end if birth == "none" and death == "none" then -- Alternative if Wikidata is missing data -- return p.bdDateAlt(name) return birth .. " " .. death else return birth .. " " .. death end end --- URL-encode special characters --- Note: this function was added later to deal with "&" characters instead of using p.ia_url_encode since --- that may break existing instances of the template. function p.urlX(str) if (str) then str = mw.ustring.gsub (str, "&", "%%26") end return str end --- URL-encode a string --- http://lua-users.org/wiki/StringRecipes --- function p.ia_url_encode(str) if (str) then str = mw.ustring.gsub (str, "\n", "\r\n") str = mw.ustring.gsub (str, "([^%w %-%_%.%~])", function (c) return mw.ustring.format ("%%%02X", string.byte(c)) end) str = mw.ustring.gsub (str, " ", "+") end return str end -- Does str contain extended ascii? 1 = yes function p.ia_extendedascii(str) for i = 1, str:len() do if (str:byte(i) >= 32 and str:byte(i) <= 126) and str:byte(i) ~= 39 then -- 39 = "'" --do nothing else return 1 end end return 0 end -- UTF-8 aware replacement for string.sub() which doesn't support UTF-8. -- Note: Using instead of mw.ustring.sub() which I suspect(?) might be cause of intermittent error, and faster here for first-letter job. -- Source: prapin @ Stack Overflow http://stackoverflow.com/questions/13235091/extract-the-first-letter-of-a-utf-8-string-with-lua function p.firstLetter(str) return str:match("[%z\1-\127\194-\244][\128-\191]*") end -- Replace all extended ascii characters with wildcard '*' -- Replace "-" with <space> eg. Pierre-Jean -> Pierre Jean function p.ia_extendedascii2wildcard(str) local s = "" local j = 0 local k = 0 for i = 1, str:len() do k = str:byte(i) if k >= 32 and k <= 126 then -- For list of Lucene special characters needing to be escaped: -- http://lucene.apache.org/core/4_10_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Escaping_Special_Characters -- We only worry about - (45) and " (34) since the others are unlikely to appear in a proper name. -- Also ' (39) since it is sometimes the extended character ’ if k == 34 or k == 39 then s = s .. "*" elseif k == 45 then s = s .. " " else s = s .. str:sub(i,i) end else if j == 1 then s = s .. "*" j = 2 end if j == 0 then j = 1 end if j == 2 then j = 0 end end end return s end -- Replace accented letters with non-accented equivalent letters -- Note: this is not a complete list of all possible accented letters. It is -- all of the accented letters found in the first 10,000 names using -- the Internet Archive author template. function p.ia_deaccent(str) local s = str s = mw.ustring.gsub(s, "á", "a") s = mw.ustring.gsub(s, "a︡", "a") s = mw.ustring.gsub(s, "Á", "A") s = mw.ustring.gsub(s, "ă", "a") s = mw.ustring.gsub(s, "â", "a") s = mw.ustring.gsub(s, "æ", "ae") s = mw.ustring.gsub(s, "Æ", "AE") s = mw.ustring.gsub(s, "à", "a") s = mw.ustring.gsub(s, "ā", "a") s = mw.ustring.gsub(s, "Ā", "A") s = mw.ustring.gsub(s, "ą", "a") s = mw.ustring.gsub(s, "å", "a") s = mw.ustring.gsub(s, "Å", "A") s = mw.ustring.gsub(s, "ã", "a") s = mw.ustring.gsub(s, "ä", "a") s = mw.ustring.gsub(s, "Ä", "A") s = mw.ustring.gsub(s, "β", "B") s = mw.ustring.gsub(s, "ć", "c") s = mw.ustring.gsub(s, "č", "c") s = mw.ustring.gsub(s, "Č", "C") s = mw.ustring.gsub(s, "ç", "c") s = mw.ustring.gsub(s, "Ç", "C") s = mw.ustring.gsub(s, "ĉ", "c") s = mw.ustring.gsub(s, "ď", "d") s = mw.ustring.gsub(s, "đ", "d") s = mw.ustring.gsub(s, "é", "e") s = mw.ustring.gsub(s, "É", "E") s = mw.ustring.gsub(s, "ě", "e") s = mw.ustring.gsub(s, "ê", "e") s = mw.ustring.gsub(s, "è", "e") s = mw.ustring.gsub(s, "È", "E") s = mw.ustring.gsub(s, "ε", "e") s = mw.ustring.gsub(s, "ē", "e") s = mw.ustring.gsub(s, "Ē", "E") s = mw.ustring.gsub(s, "ę", "e") s = mw.ustring.gsub(s, "ð", "e") s = mw.ustring.gsub(s, "ë", "e") s = mw.ustring.gsub(s, "Ë", "E") s = mw.ustring.gsub(s, "γ", "Y") s = mw.ustring.gsub(s, "ħ", "h") s = mw.ustring.gsub(s, "i︠a︡", "ia") s = mw.ustring.gsub(s, "í", "i") s = mw.ustring.gsub(s, "i︠", "i") s = mw.ustring.gsub(s, "ĭ", "i") s = mw.ustring.gsub(s, "Í", "I") s = mw.ustring.gsub(s, "î", "i") s = mw.ustring.gsub(s, "Î", "I") s = mw.ustring.gsub(s, "ì", "i") s = mw.ustring.gsub(s, "ī", "i") s = mw.ustring.gsub(s, "ł", "i") s = mw.ustring.gsub(s, "ï", "i") s = mw.ustring.gsub(s, "Ï", "I") s = mw.ustring.gsub(s, "ĺ", "I") s = mw.ustring.gsub(s, "Ĺ", "L") s = mw.ustring.gsub(s, "μ", "u") s = mw.ustring.gsub(s, "µ", "u") s = mw.ustring.gsub(s, "ń", "n") s = mw.ustring.gsub(s, "ň", "n") s = mw.ustring.gsub(s, "ņ", "n") s = mw.ustring.gsub(s, "ñ", "n") s = mw.ustring.gsub(s, "Ñ", "N") s = mw.ustring.gsub(s, "ó", "o") s = mw.ustring.gsub(s, "Ó", "O") s = mw.ustring.gsub(s, "ô", "o") s = mw.ustring.gsub(s, "œ", "oe") s = mw.ustring.gsub(s, "ò", "o") s = mw.ustring.gsub(s, "ō", "o") s = mw.ustring.gsub(s, "ø", "o") s = mw.ustring.gsub(s, "Ø", "o") s = mw.ustring.gsub(s, "õ", "o") s = mw.ustring.gsub(s, "ö", "o") s = mw.ustring.gsub(s, "ő", "o") s = mw.ustring.gsub(s, "Ö", "O") s = mw.ustring.gsub(s, "φ", "o") s = mw.ustring.gsub(s, "ŕ", "r") s = mw.ustring.gsub(s, "ř", "r") s = mw.ustring.gsub(s, "Ř", "R") s = mw.ustring.gsub(s, "ß", "ss") s = mw.ustring.gsub(s, "ś", "s") s = mw.ustring.gsub(s, "Ś", "S") s = mw.ustring.gsub(s, "š", "s") s = mw.ustring.gsub(s, "ṣ", "s") s = mw.ustring.gsub(s, "Š", "S") s = mw.ustring.gsub(s, "ş", "s") s = mw.ustring.gsub(s, "Ş", "S") s = mw.ustring.gsub(s, "ŝ", "s") s = mw.ustring.gsub(s, "σ", "s") s = mw.ustring.gsub(s, "ť", "t") s = mw.ustring.gsub(s, "ţ", "t") s = mw.ustring.gsub(s, "τ", "t") s = mw.ustring.gsub(s, "þ", "p") s = mw.ustring.gsub(s, "Þ", "p") s = mw.ustring.gsub(s, "ú", "u") s = mw.ustring.gsub(s, "Ú", "U") s = mw.ustring.gsub(s, "û", "u") s = mw.ustring.gsub(s, "ù", "u") s = mw.ustring.gsub(s, "ū", "u") s = mw.ustring.gsub(s, "ů", "u") s = mw.ustring.gsub(s, "ü", "u") s = mw.ustring.gsub(s, "Ü", "U") s = mw.ustring.gsub(s, "ŵ", "w") s = mw.ustring.gsub(s, "ý", "y") s = mw.ustring.gsub(s, "ŷ", "y") s = mw.ustring.gsub(s, "¥", "y") s = mw.ustring.gsub(s, "ÿ", "y") s = mw.ustring.gsub(s, "Ÿ", "Y") s = mw.ustring.gsub(s, "ź", "z") s = mw.ustring.gsub(s, "Ž", "Z") s = mw.ustring.gsub(s, "ž", "z") s = mw.ustring.gsub(s, "ż", "z") s = mw.ustring.gsub(s, "Ż", "Z") return s end return p