Sample skeleton code for WP:URLREQ move requests. This is the "easy" version for straightforward moves.
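A typical request migrates every link from old.com to new.com: the regex variables at the top define the two domains, and each pass below rewrites a different wikitext pattern (cite templates, square-bracket links, bare URLs, {{webarchive}} wrappers, ref names).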
urlchanger-skeleton-easy.nim
discard """
The MIT License (MIT)
Copyright (c) 2016-2021 by User:GreenC (at en.wikipedia.org)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE."""
# Search on "CUSTOM" for project-specific code
var
ReoldA = "old[.]com"
ReoldB = "old.com"
RenewA = "new[.]com"
RenewB = "new.com"
Reold1 = "(?i)https?[:][/]{2}(([^.]+)[.])?" & ReoldA
Reold2 = "http://" & ReoldB
Reold3 = "http://www." & ReoldB
Reold4 = "(?i)(www[.])?" & ReoldA
Repr1 = "(?i)url[ ]*[=][ ]*[/]{2}" & ReoldA
Repr2 = "(?i)url[ ]*[=][ ]*[/]{2}www[.]" & ReoldA
Repr3 = "(?i)[[][ ]*[/]{2}" & ReoldA
Repr4 = "(?i)[[][ ]*[/]{2}www[.]" & ReoldA
Renew1 = "https://" & RenewB
Renew2 = "https[:][/]{2}" & RenewA
Renew3 = "(?i)https?[:][/]{2}(([^.]+)[.])?" & RenewA
Renew4 = "(?i)(www[.])?" & RenewA
Renew5 = RenewB # base domain used for <ref name="new.com">
#
# Custom version of headerlocation() in medicapi.nim
# For cases like https://dcms.lds.org/delivery/DeliveryManagerServlet?from=fhd&dps_pid=IE1170338
# If a Location header doesn't include a domain name, use the domain from the first Location
#
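# Illustrative example: if the last "Location:" header is a relative path such as
#   Location: /delivery/DeliveryManagerServlet?dps_pid=IE1170338
# the scheme+hostname of the first Location (or of flag3 when supplied) is prepended,
# giving e.g. https://dcms.lds.org/delivery/DeliveryManagerServlet?dps_pid=IE1170338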
proc headerlocation_urlchanger*(head: string, fl: varargs[string]): string =
var
mcache = newSeq[string](0)
c, f, le: int
flag, flag2, flag3, firstlocation = ""
firstlocationtrap = false
if len(fl) == 1:
flag = fl[0]
if len(fl) == 2:
flag = fl[0]
flag2 = fl[1]
if len(fl) == 3:
flag = fl[0]
flag2 = fl[1]
flag3 = fl[2]
c = awk.split(head, a, "\n")
for i in 0..c - 1:
if a[i] ~ "(?i)^[ ]{0,5}location[ ]?[:]":
if not empty(flag): # get URLs
awk.sub("(?i)^[ ]*location[ ]*[:][ ]*", "", a[i])
if not firstlocationtrap and a[i] ~ "^http": # get scheme+hostname of first Location: entry
firstlocationtrap = true
firstlocation = uriparseElement(a[i], "scheme")
firstlocation = firstlocation & "://" & uriparseElement(a[i], "hostname")
if a[i] !~ "^http": # If last Location: has no scheme+hostname then tack it on from the first Location:
if not empty(flag3): # Otherwise use the scheme+hostname in flag3
a[i] = flag3 & a[i]
else:
if firstlocation ~ "^http":
a[i] = firstlocation & a[i]
else:
return ""
if empty(flag2):
if isarchiveorg(a[i]): # both branches are identical in this skeleton; customize if archive.org URLs need special handling
mcache.add(strip(a[i]))
else:
mcache.add(strip(a[i]))
else: # get timestamps
if awk.split(strip(a[i]), b, " ") > 1:
f = awk.split(b[1], e, "/")
for k in 0..f-1:
if e[k] ~ "^[0-9]{14}$":
mcache.add(e[k])
break
le = len(mcache)
if le > 0:
if len(mcache[le - 1]) > 0: # Get the last HTTP response
return mcache[le - 1]
#
# Return DEADLINK unless the cite template matches skiptemplate, in which case return SKIPDEADLINK
#
template checklinkredir_helper(tl, skiptemplate: string) =
if empty(skiptemplate) or tl !~ skiptemplate:
return "DEADLINK"
return "SKIPDEADLINK"
#
# Follow a link through its redirects and return the ultimate destination.
#
# . Return the new url if one can be found
# . Return "" if it can't find a redirect. An archive is added if the url returns 404; if it returns 200 it is left untouched
# . Return "DEADLINK" if it can't find a redirect. Force adding an archive regardless of url status. Useful if the redirect is a known homepage, for example.
# . Return "SKIPDEADLINK" if it can't find a redirect. Do not add an archive no matter what.
#
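# Illustrative call (hypothetical URL): checklinkredir("http://old.com/page1", "{{cite web |url=http://old.com/page1}}")
# might return "https://new.com/page1" (the rewritten URL answers 200), "DEADLINK" (404 on a template not
# excluded by skiptemplate), or "SKIPDEADLINK" (404 on an excluded template, or an unusable response)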
proc checklinkredir*(url, tl: string): string =
result = ""
var
url = url
# CUSTOM
skiptemplate = "(?i)[{]{2}[ ]*album[ -]?chart" # Skip adding new archives for these templates or set to blank if none
newurl = ""
headres: int
# CUSTOM
fullurl = Reold1 & GX.endurlcs # GX.endurlcs = "[^\\s\\]|}{<]*[^\\s\\]|}{<]*"
if awk.match(url, fullurl, dest) > 0:
#se("URL0 = " & url)
#se("DEST0 = " & dest)
# CUSTOM
newurl = dest
gsub(Reold1, Renew1, newurl) # "(?i)https?[:][/]{2}(([^.]+)[.])?old[.]com[.]", "https://new.com"
if newurl ~ Renew2: # "https[:][/]{2}new[.]com"
var (head, bodyfilename) = getheadbody(newurl)
bodyfilename = "" # supress compile warn
headres = headerresponse(head)
if headres == 200: # OK
return newurl
elif headres == 404 or headres == -1: # Dead
checklinkredir_helper(tl, skiptemplate)
elif headres == 301 or headres == 302: # Redirect
var redirurl = headerlocation_urlchanger(head)
sendlog(Project.urlchanger, CL.name, url & " ---- " & redirurl & " ---- Redirect found: check it out ---- urlchanger7.1")
if not empty(redirurl):
var (head2, bodyfilename2) = getheadbody(redirurl)
bodyfilename2 = "" # supress compile warn
if headerresponse(head2) == 200:
return redirurl
elif headerresponse(head2) == 404:
checklinkredir_helper(tl, skiptemplate)
else:
sendlog(Project.urlchanger, CL.name, url & " ---- " & redirurl & " ---- Redirect not working - aborting ---- urlchanger7.2")
return "SKIPDEADLINK"
else:
sendlog(Project.urlchanger, CL.name, url & " ---- " & redirurl & " ---- Redirect not working - aborting ---- urlchanger7.5")
return "SKIPDEADLINK"
elif headres == 403 or headres == 500: # Forbidden or server error
checklinkredir_helper(tl, skiptemplate)
else:
sendlog(Project.urlchanger, CL.name, url & " ---- Unknown response code - aborting ---- urlchanger7.3")
return "SKIPDEADLINK"
else:
sendlog(Project.urlchanger, CL.name, url & " ---- Unknown problem: check it out ---- urlchanger7.4")
checklinkredir_helper(tl, skiptemplate)
if tl !~ skiptemplate:
return ""
else:
return "SKIPDEADLINK"
#
# Last step: whole-article check that logs any links which were not converted
#
proc checklinkexists(): string {.discardable.} =
if Runme.urlchanger != true:
return
var
fullurl = Reold1 & GX.endurlcs # GX.endurlcs = "[^\\s\\]|}{<]*[^\\s\\]|}{<]*"
psplit(GX.articlework, fullurl, p):
# skip archives and cite templates, imperfect method due to duplicates
if awk.match(GX.articlework, "([/]|[?]url[=])https?" & escapeRe(gsubi("^https?", "", p.field[i])) ) == 0 and awk.match(GX.articlework, escapeRe(p.field[i]) & GX.space & GX.webarchive) == 0:
sendlog(Project.urlchanger, CL.name, p.field[i] & " ---- Link wasn't converted: check it out ---- checklinkexists1.1")
#
# Replace the given domain throughout the article; dead links get an archive.org/web/1899.. placeholder
#
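# Illustrative outcomes for a single |url= (assuming addarchive is true):
#   http://old.com/x -> https://new.com/x (new URL answers 200)
#   http://old.com/x -> https://web.archive.org/web/18990101080101/http://old.com/x (dead link; the 1899
#   timestamp is a placeholder, presumably swapped for a real snapshot by a later archiving pass)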
proc urlchanger(): bool {.discardable.} =
if Runme.urlchanger != true:
return false
var
url,res,archiveurl,webarchive,sourceurl,title,head,bodyfilename,fpHTML,prurl,urltype = ""
tot = 0
fullurl = Reold1 & GX.endurlcs # GX.endurlcs = "[^\\s\\]|}{<]*[^\\s\\]|}{<]*"
# CUSTOM
addarchive = true # if true then it will add archive URLs if link is dead
psplit(GX.articlework, Repr1, p): # "(?i)url[ ]*[=][ ]*[/]{2}old[.]com"
p.field[i] = "url = " & Reold2 # "http://old.com"
inc(p.ok)
psplit(GX.articlework, Repr2, p): # "(?i)url[ ]*[=][ ]*[/]{2}www[.]old[.]com"
p.field[i] = "url = " & Reold3 # "http://www.old.com"
inc(p.ok)
psplit(GX.articlework, Repr3, p): # "(?i)[[][ ]*[/]{2}old[.]com"
p.field[i] = "[" & Reold2 # "http://old.com"
inc(p.ok)
psplit(GX.articlework, Repr4, p): # "(?i)[[][ ]*[/]{2}www[.]old[.]com"
p.field[i] = "[" & Reold3 # "http://www.old.com"
inc(p.ok)
# Convert cases like:
# ">http://www.highbeam.com/doc/1G1-9343909.html"
# "#http://www.highbeam.com/doc/1G1-9343909.html"
# "*http://www.highbeam.com/doc/1G1-9343909.html"
psplit(GX.articlework, "[>#*]{1}[ ]*" & fullurl, p):
if awk.match(p.field[i], "^[>#*]{1}[ ]*", dest1) > 0:
if awk.match(p.field[i], fullurl, dest2) > 0:
p.field[i] = dest1 & "[" & dest2 & " " & Runme.urlchangerTag & "]"
sed("Converting bare to bracket: " & p.field[i], Debug.network)
sendlog(Project.urlchanger, CL.name, p.field[i] & " ---- convert barelink to bracket ---- urlchanger0.1")
inc(p.ok)
inc(tot)
# Replace in {{cite web |url}} ({{dead}}{{cbignore}})?
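# Illustrative case: {{cite web |url=http://old.com/x |archive-url=https://web.archive.org/web/20150101000000/http://old.com/x |url-status=dead}}
# -> |url= becomes https://new.com/x, the archive snapshot is re-queried for the new URL, and |url-status= is set to live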
# CUSTOM template additions
var citelist3 = GX.citelist & "|album[ -]?chart"
var cite3 = "(?i)([{][{][ ]*(" & citelist3 & ")[^}]+}})"
psplit(GX.articlework, cite3 & "[ ]*(" & GX.dead & "[ ]*(" & GX.cbignore & ")?)?", p):
url = ""
urltype = ""
# find url, otherwise try alternatives like chapter-url etc..
prurl = getarg("url", "clean", p.field[i])
if prurl ~ fullurl:
urltype = "url"
url = prurl
else:
awk.split("chapter-url contribution-url entry-url article-url section-url map-url conference-url transcript-url lay-url", a, " ")
for k in 0..len(a) - 1:
if isarg(a[k], "exists", p.field[i]):
prurl = getarg(a[k], "clean", p.field[i])
if prurl ~ fullurl:
urltype = a[k]
url = prurl
break
if url ~ fullurl:
gsub("[#]$", "", url)
res = checklinkredir(url, p.field[i])
if not empty(res) and res !~ "DEADLINK$" and res != url and not empty(urltimestamp(getarg("archive-url", "clean", p.field[i]))):
if isarg(urltype, "exists", p.field[i]): # swap in new URL
p.field[i] = replacearg(p.field[i], urltype, res, "urlchanger1.1")
if isarg("archive-url", "exists", p.field[i]): # move archive URL
var tup: tuple[url: string, status: int, response: int]
tup = queryapiget(res, urltimestamp(getarg("archive-url", "clean", p.field[i])) )
if tup.status == 1:
# p.field[i] = replacearg(p.field[i], "archive-url", "https://web.archive.org/web/18990101080101/" & res, "urlchanger1.1a")
p.field[i] = replacearg(p.field[i], "archive-url", tup.url, "urlchanger1.1a")
if isarg("url-status", "exists", p.field[i]):
p.field[i] = replacearg(p.field[i], "url-status", "live", "urlchanger1.1b")
else:
sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- not removed archive ---- urlchanger1.6")
# awk.split("archive-url archive-date url-status", a, " ") # delete existing archives
# for k in 0..len(a) - 1:
# if isarg(a[k], "exists", p.field[i]):
# p.field[i] = gsubs(getarg(a[k], "bar", p.field[i]), "", p.field[i])
# if a[k] ~ "archive-url":
# sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- removed archive ---- urlchanger1.6")
gsub(GX.dead & "[ ]*" & GX.cbignore, "", p.field[i])
gsub(GX.dead, "", p.field[i])
p.ok += inclog("urlchanger1.1", GX.esurlchange, Project.syslog, url & " ---- " & res)
inc(tot)
else: # add archive if url= is dead
if addarchive and urltype == "url" and res != "SKIPDEADLINK":
if res != "DEADLINK":
(head, bodyfilename) = getheadbody(url, "one") # check the original URL is dead
if headerresponse(head) != 200 or res == "DEADLINK":
gsub(GX.dead & "[ ]*" & GX.cbignore, "", p.field[i])
gsub(GX.dead, "", p.field[i])
archiveurl = getarg("archive-url", "clean", p.field[i])
if empty(archiveurl):
p.field[i] = replacearg(p.field[i], "url", "https://web.archive.org/web/18990101080101/" & url, "urlchanger1.1")
sed("Converting to 1899 (1): " & p.field[i], Debug.network)
inc(p.ok)
inc(tot)
else: # Add/modify |url-status=dead
if isarg("url-status", "missing" , p.field[i]):
if isarg("url", "exists", p.field[i]):
addarg("url-status", "dead", "archive-url", p.field[i]):
p.ok += inclog("urlchanger1.2", GX.esurlchange, Project.urlchanger, url & " ---- add url-status status")
inc(tot)
# modelbar = getarg(firstarg(p.field[i]), "bar", p.field[i])
# locbar = getarg(notlastarg(p.field[i], "archive-url"), "bar", p.field[i])
# if not empty(modelbar):
# if not empty(modelfield(modelbar, "url-status", "dead")):
# gsubs(locbar, locbar & modelfield(modelbar, "url-status", "dead"), p.field[i])
# p.ok += inclog("urlchanger1.2", GX.esurlchange, Project.urlchanger, url & " ---- add url-status status")
# inc(tot)
else:
if getarg("url-status", "clean", p.field[i]) !~ "(?i)dead":
p.field[i] = replacearg(p.field[i], "url-status", "dead", "urlchanger1.2")
p.ok += inclog("urlchanger1.3", GX.esurlchange, Project.urlchanger, url & " ---- modify url-status status")
inc(tot)
# replace [state.gov] {{webarchive}}
psplit(GX.articlework, "[[][ ]*" & fullurl & "[^]]*[]][ ]*" & GX.webarchive, p):
if awk.match(p.field[i], GX.webarchive, webarchive) > 0 and awk.match(p.field[i], fullurl, url) > 0:
res = checklinkredir(url, p.field[i])
if not empty(res) and res !~ "DEADLINK$" and res != url and not empty(urltimestamp(getarg("url", "clean", webarchive))):
var tup: tuple[url: string, status: int, response: int]
tup = queryapiget(res, urltimestamp(getarg("url", "clean", webarchive)) )
if tup.status == 1:
let orig = webarchive
webarchive = replacearg(webarchive, "url", tup.url, "urlchanger2.2")
subs(orig, "", p.field[i])
subs(url, res, p.field[i])
p.field[i] = p.field[i] & webarchive
p.ok += inclog("urlchanger2.1", GX.esurlchange, Project.syslog, url & " ---- " & res & " ---- delete webarchive (removed archive)")
inc(tot)
else:
sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- not removed archive ---- urlchanger2.2")
# Replace in [state.gov] ({dead}{cbignore})?
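# Illustrative case: [http://old.com/x Title] {{dead link}} -> [https://new.com/x Title] (dead/cbignore tags removed)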
psplit(GX.articlework, "[[][ ]*" & fullurl & "[^]]*[]][ ]*(" & GX.dead & "[ ]*(" & GX.cbignore & ")?)?", p):
if awk.match(p.field[i], fullurl, url) > 0:
res = checklinkredir(url, p.field[i])
gsub(GX.dead & "[ ]*" & GX.cbignore, "", p.field[i])
gsub(GX.dead, "", p.field[i])
if not empty(res) and res !~ "DEADLINK$":
gsubs(url, res, p.field[i])
#CUSTOM - changes to square-link title field
gsub("(?i)chartstats[.](org|com)", "Official Charts Company", p.field[i])
gsub("(?i)charts?[ ]?stats", "Official Charts Company", p.field[i])
gsub("(?i)UK (singles|album) charts?", "Official Charts Company", p.field[i])
gsub("[(]Link redirected to OCC website[)]", "", p.field[i])
p.ok += inclog("urlchanger4.1", GX.esurlchange, Project.syslog, url & " ---- " & res & " ---- modify squarelink")
inc(tot)
else: # add archive
if addarchive and res != "SKIPDEADLINK":
if match(GX.articlework, escapeRe(p.field[i]) & GX.space & GX.webarchive, dest) == 0: # skip if followed by {{webarchive}}
if res != "DEADLINK":
(head, bodyfilename) = getheadbody(url, "one") # check the original URL is dead
if headerresponse(head) != 200 or res == "DEADLINK":
gsubs(url, "https://web.archive.org/web/18990101080101/" & url, p.field[i])
sed("Converting to 1899 (2): " & p.field[i], Debug.network)
inc(p.ok)
inc(tot)
# replace standalone {{webarchive}} - should come after the above for urlchanger3.2 to work
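# Illustrative case: {{webarchive |url=https://web.archive.org/web/20150101000000/http://old.com/x |title=X}} -> [https://new.com/x X]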
psplit(GX.articlework, GX.webarchive, p):
url = getarg("url", "clean", p.field[i])
if url ~ fullurl:
if awk.match(GX.articlework, "[]][ ]*" & escapeRe(p.field[i])) == 0: # skip [state.gov] {{webarchive}}
sourceurl = urlurl(url)
res = checklinkredir(sourceurl, p.field[i])
if not empty(res) and res !~ "DEADLINK$":
title = getarg("title", "clean", p.field[i])
if not empty(title):
p.field[i] = "[" & res & " " & title & "]"
else:
p.field[i] = "[" & res & "]"
p.ok += inclog("urlchanger3.1", GX.esurlchange, Project.syslog, sourceurl & " ---- " & res & " ---- replace webarchive")
inc(tot)
if countsubstring(GX.articlework, res) > 1: # look for bugs
sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- bug in standalone webarchive conversion ---- urlchanger3.2")
# Replace [archive.org/state.gov] with [state.gov] {{webarchive}}
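# Illustrative case: [https://web.archive.org/web/20150101000000/http://old.com/x Title]
# -> [https://new.com/x]{{webarchive |url=https://web.archive.org/web/20150101000000/http://old.com/x |date=...}}
# (the original link title is not carried over by this pass)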
psplit(GX.articlework, "[[][ ]*https?[:][/]{2}(www[.]|web[.])?archive[.](org|today|is)[/](web[/])?[0-9]{14}[/]" & fullurl & "[^]]*[]]", p):
if awk.match(p.field[i], fullurl, url) > 0:
gsub("[/]$", "", url)
awk.match(p.field[i], "https?[:][/]{2}(www[.]|web[.])?archive[.](org|today|is)[/](web[/])?[0-9]{14}[/]" & fullurl, archiveurl)
res = checklinkredir(url, p.field[i])
if not empty(res) and res !~ "DEADLINK$" and res != url and not empty(urltimestamp(archiveurl) ):
var tup: tuple[url: string, status: int, response: int]
tup = queryapiget(res, urltimestamp(archiveurl) )
if tup.status == 1 and not empty(timestamp2numericdate(urltimestamp(archiveurl))):
p.field[i] = "[" & res & "]" & "{{webarchive |url=" & archiveurl & " |date=" & timestamp2numericdate(urltimestamp(archiveurl)) & "}}"
p.ok += inclog("urlchanger5.1", GX.esurlchange, Project.syslog, archiveurl & " ---- " & res & " ---- replace archive squarelink")
inc(tot)
else:
sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- not removed archive ---- urlchanger5.2")
# gsubs(archiveurl, res, p.field[i])
# p.ok += inclog("urlchanger5.1", GX.esurlchange, Project.syslog, archiveurl & " ---- " & res & " ---- replace archived squarelink")
# inc(tot)
# Replace [webcitation.org/query?url=https://state.gov] with [state.gov]
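# Illustrative case: [https://www.webcitation.org/query?url=http://old.com/x Title] -> [https://new.com/x Title]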
psplit(GX.articlework, "[[][ ]*https?[:][/]{2}(www[.]|web[.])?webcitation[.]org[/]query[?]url=" & fullurl & "[^]]*[]]", p):
if awk.match(p.field[i], fullurl, url) > 0:
gsub("[/]$", "", url)
awk.match(p.field[i], "https?[:][/]{2}(www[.]|web[.])?webcitation[.]org[/]query[?]url=" & fullurl, archiveurl)
res = checklinkredir(url, p.field[i])
if not empty(res) and res !~ "DEADLINK$" and res != url:
gsubs(archiveurl, res, p.field[i])
p.ok += inclog("urlchanger5.2", GX.esurlchange, Project.syslog, archiveurl & " ---- " & res & " ---- replace webcitationquary" )
inc(tot)
# If url is already switched to new but archive-url and other metadata for old URL still exists
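# Illustrative case: {{cite web |url=https://new.com/x |archive-url=https://web.archive.org/web/20150101000000/http://old.com/x |url-status=dead |work=old.com}}
# -> archive-url/archive-date/url-status are removed and |work= is updated to new.com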
psplit(GX.articlework, GX.cite2, p):
prurl = getarg("url", "clean", p.field[i])
if prurl ~ Renew3: # "(?i)https?[:][/]{2}(([^.]+)[.])?new[.]com"
var f = 0
var g = 0
if getarg("archive-url", "clean", p.field[i]) ~ fullurl:
awk.split("archive-url archive-date url-status", a, " ")
for k in 0..len(a) - 1:
if isarg(a[k], "exists", p.field[i]):
p.field[i] = gsubs(getarg(a[k], "bar", p.field[i]), "", p.field[i])
inc(f)
# CUSTOM field changes # change text in work, publisher etc..
awk.split("work website publisher title", a, " ")
for k in 0..len(a) - 1:
if isarg(a[k], "exists", p.field[i]):
var cleanarg = getarg(a[k], "clean", p.field[i])
if awk.match(cleanarg, Reold4, dest) > 0: # "(?i)(www[.])?old[.]com"
if a[k] !~ "(title|publisher)":
p.field[i] = replacearg(p.field[i], a[k], "new.com", "urlchanger5.3.1") # replace whole arg value with new URL
inc(g)
else:
cleanarg = gsubs(dest, "new.com", cleanarg) # replace string within arg value new URL
p.field[i] = replacearg(p.field[i], a[k], cleanarg, "urlchanger5.3.2")
inc(g)
# add more cases here. See urlchanger-chartstats.nim for broader examples
# CUSTOM field changes
# delete |publisher= if |work= carries the same info (one holds the old domain, the other the new)
# Reold4 = "(?i)(www[.])?old[.]com" Renew4 = "(?i)(www[.])?new[.]com"
if getarg("work", "clean", p.field[i]) ~ Reold4 and getarg("publisher", "clean", p.field[i]) ~ Renew4:
gsubs(getarg("publisher", "bar", p.field[i]), "", p.field[i])
# p.field[i] = replacearg(p.field[i], "work", "[[Official Charts Company]]", "urlchanger5.3.3")
inc(g)
if getarg("website", "clean", p.field[i]) ~ Reold4 and getarg("publisher", "clean", p.field[i]) ~ Renew4:
gsubs(getarg("publisher", "bar", p.field[i]), "", p.field[i])
# p.field[i] = replacearg(p.field[i], "work", "[[Official Charts Company]]", "urlchanger5.3.4")
inc(g)
if f > 0:
p.ok += inclog("urlchanger5.3", GX.esurlchange, Project.urlchanger, prurl & " ---- remove archive-url")
inc(tot)
if g > 0:
p.ok += inclog("urlchanger5.3", GX.esurlchange, Project.urlchanger, prurl & " ---- update metadata")
inc(tot)
# If url (any type) doesn't match the domain-name in work|publisher for the custom domain
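# Illustrative case (example.org is hypothetical): {{cite web |url=http://example.org/x |website=new.com}} -> |website=example.org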
psplit(GX.articlework, GX.cite2, p):
prurl = getarg("url", "clean", p.field[i])
if prurl !~ Renew3: # "(?i)https?[:][/]{2}(([^.]+)[.])?new[.]com"
awk.split("work website publisher", a, " ")
for k in 0..len(a) - 1:
if isarg(a[k], "exists", p.field[i]):
var cleanarg = getarg(a[k], "clean", p.field[i])
if awk.match(cleanarg, Renew4, dest) > 0: # "(?i)(www[.])?new[.]com"
p.field[i] = replacearg(p.field[i], a[k], uriparseElement(prurl, "hostname"), "urlchanger5.4") # replace whole arg value
p.ok += inclog("urlchanger5.4", GX.esurlchange, Project.urlchanger, prurl & " ---- " & a[k] & " ---- remove stray domain in work.etc field")
inc(tot)
# change <ref name=string/>
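# Illustrative case: <ref name="old.com albums"> -> <ref name="new.com albums">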
psplit(GX.articlework, "<ref[^>]*>", p):
if p.field[i] ~ Reold4: # "(?i)(www[.])?old[.]com"
gsub(Reold4, Renew5, p.field[i])
p.ok += inclog("urlchanger5.5", GX.esurlchange, Project.urlchanger, p.field[i] & " ---- change ref name=" & Renew5)
inc(tot)
# Bare URLs with no square bracket
# step 1: Count bare links with no square brackets and save in associative-array aar[]
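# e.g. a bare http://old.com/x appearing twice in the rendered HTML is only rewritten when the wikitext
# also contains exactly two occurrences (and none preceded by "/", i.e. not inside an archive URL);
# mismatches are logged as urlchanger8.2 / urlchanger8.3 in step 2 and skipped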
var aar = initTable[string, int]()
(head, bodyfilename) = getheadbody("https://en.wikipedia.orghttps://demo.azizisearch.com/lite/wikipedia/page/" & quote(CL.name), "one") # scrape body
fpHTML = readfile(bodyfilename)
if not empty(fpHTML):
psplit(fpHTML, "[>]http[^<]+[<][/][Aa][>]", p):
gsub("^[>]|[<][/][Aa][>]$", "", p.field[i])
if awk.match(p.field[i], fullurl, dest) > 0:
if len(p.field[i]) == len(dest) and GX.articlework !~ ("https://web.archive.org/web/18990101080101/" & dest):
if hasKey(aar, p.field[i]):
inc(aar[p.field[i]])
else:
aar[p.field[i]] = 1
aar[convertxml(p.field[i])] = 1 # catch all possibilities as URLs are sometimes HTML-encoded and sometimes not
# step 2: make sure the number of bare links equals number of URLs otherwise log and skip
# replace all the URLs with gsub()
for aurl in aar.keys:
# se("AURL0 = " & aurl)
# se("AURL1 = " & $aar[aurl])
# se("AURL2 = " & $countsubstring(GX.articlework, aurl))
if countsubstring(GX.articlework, aurl) == aar[aurl] and countsubstring(GX.articlework, "/" & aurl) == 0:
# (CL.name & "---- " & aurl & " ---- Orphan link ---- checklinkexists1.1") >> Project.meta & logfile
var res = checklinkredir(aurl, "")
# se("RES = " & res)
if (empty(res) or res == "DEADLINK") and res != "SKIPDEADLINK":
if addarchive:
gsubs(aurl, "[https://web.archive.org/web/18990101080101/" & aurl & "]", GX.articlework)
sed("Converting to 1899 (3): " & aurl, Debug.network)
inc(tot)
elif not empty(res) and res !~ "DEADLINK$":
for i in 1..aar[aurl]:
inclog("urlchanger8.1", GX.esurlchange, Project.syslog, aurl & " ---- " & res)
inc(tot)
gsubs(aurl, res, GX.articlework)
elif convertxml(aurl) == aurl and countsubstring(GX.articlework, aurl) > aar[aurl]:
sendlog(Project.urlchanger, CL.name, aurl & " ---- Too many bare URLs ---- urlchanger8.2")
elif convertxml(aurl) == aurl and countsubstring(GX.articlework, aurl) < aar[aurl]:
sendlog(Project.urlchanger, CL.name, aurl & " ---- Bare URLs missing ---- urlchanger8.3")
#CUSTOM
# split the article into <ref></ref> bodies and take action inside them. This catches hard-to-fix items like a domain name outside a square link
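# Illustrative case: a ref body such as "[https://new.com/stats/123 Chart] at old.com, retrieved 2015" --
# the bracketed URL is temporarily hidden (__hideurl__) so the plain-text domain mention can be rewritten
# without touching the URL itself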
let cc = awk.split(GX.articlework, bb, "<ref[^>]*>")
for z in 0..cc - 1:
if len(bb[z]) > 1:
var endref = index(bb[z], "</ref>")
if endref > 1:
var kk = substr(bb[z], 0, endref - 1)
#se("KK = " & kk)
Renew3 = "(?i)https?[:][/]{2}(([^.]+)[.])?new[.]com"
if kk ~ Renew3 and kk ~ ("(?i)[ .,-]" & ReoldA):
var orig = kk
# see also urlchanger-msnbc
if match(kk, Renew3 & GX.endurlcs, hideurl) > 0:
gsubs(hideurl, "__hideurl__", kk)
gsub("(?i)(www)?[ .,-]" & RenewA, " " & RenewB, kk)
gsubs("__hideurl__", hideurl, kk)
#se("NEW = " & kk)
GX.articlework = replacefullref(orig, orig, kk, "citeurlchanger1")
inclog("urlchanger9.1", GX.esurlchange, Project.urlchanger, orig & " ---- " & kk & " ---- change floating cite")
# Sometimes Love Just Ain't Enough
inc(tot)
if tot == 0:
sendlog(Project.urlchanger, CL.name, " ---- None found ---- urlchanger9.2")
return true