Utilisateur:Tavernier/cosmetic changes.py
Apparence
(Redirigé depuis Utilisateur:J.M. Tavernier/cosmetic changes.py)
/*
*/
# -*- coding: utf-8 -*-
"""
This module can do slight modifications to a wiki page source code such that
the code looks cleaner. The changes are not supposed to change the look of the
rendered wiki page.
WARNING: This module needs more testing!
"""
import wikipedia, pagegenerators
import sys
import re
# Summary message when using this module as a stand-alone script
msg_standalone = {
'de': u'Bot: Kosmetische Änderungen',
'en': u'Robot: Cosmetic changes',
'fr': u'Bot: Retouches cosmétiques',
'he': u'רובוט: שינויים קוסמטיים',
'pt': u'Bot: Mudanças triviais',
}
# Summary message that will be appended to the normal message when
# cosmetic changes are made on the fly
msg_append = {
'de': u'; kosmetische Änderungen',
'en': u'; cosmetic changes',
'fr': u'; retouches cosmétiques',
'he': u'; שינויים קוסמטיים',
'pt': u'; mudanças triviais',
}
deprecatedTemplates = {
'wikipedia': {
'de': [
u'Stub',
]
}
}
class CosmeticChangesToolkit:
def __init__(self, site, debug = False, acceptall = False):
self.site = site
self.debug = debug
def change(self, text):
"""
Given a wiki source code text, returns the cleaned up version.
"""
oldText = text
text = self.standardizeInterwiki(text)
text = self.standardizeCategories(text)
text = self.cleanUpLinks(text)
text = self.cleanUpSectionHeaders(text)
text = self.translateAndCapitalizeNamespaces(text)
text = self.removeDeprecatedTemplates(text)
text = self.resolveHtmlEntities(text)
text = self.validXhtml(text)
text = self.removeUselessSpaces(text)
if self.debug:
wikipedia.showDiff(oldText, text)
return text
def standardizeInterwiki(self, text):
"""
Makes sure that interwiki links are put to the correct position and
into the right order.
"""
interwikiLinks = wikipedia.getLanguageLinks(text, insite = self.site)
text = wikipedia.replaceLanguageLinks(text, interwikiLinks, site = self.site)
return text
def standardizeCategories(self, text):
"""
Makes sure that interwiki links are put to the correct position, but
does not sort them.
"""
categories = wikipedia.getCategoryLinks(text, site = self.site)
text = wikipedia.replaceCategoryLinks(text, categories, site = self.site)
return text
def translateAndCapitalizeNamespaces(self, text):
"""
Makes sure that localized namespace names are used.
"""
family = self.site.family
for nsNumber in family.namespaces:
thisNs = family.namespace(self.site.lang, nsNumber)
defaultNs = family.namespace('_default', nsNumber)
if thisNs != defaultNs:
text = wikipedia.replaceExceptNowikiAndComments(text, r'\[\[\s*' + defaultNs + '\s*:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + ':\g<nameAndLabel>]]')
if self.site.nocapitalize:
for nsNumber in family.namespaces:
thisNs = family.namespace(self.site.lang, nsNumber)
lowerNs = thisNs[0].lower() + thisNs[1:] # this assumes that all NS names have length at least 2
text = wikipedia.replaceExceptNowikiAndComments(text, r'\[\[\s*' + lowerNs + '\s*:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + ':\g<nameAndLabel>]]')
return text
def cleanUpLinks(self, text):
trailR = re.compile(self.site.linktrail())
# The regular expression which finds links. Results consist of four groups:
# group title is the target page title, that is, everything before | or ].
# group section is the page section. It'll include the # to make life easier for us.
# group label is the alternative link title, that's everything between | and ].
# group linktrail is the link trail, that's letters after ]] which are part of the word.
# note that the definition of 'letter' varies from language to language.
self.linkR = re.compile(r'\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + self.site.linktrail() + ')')
curpos = 0
# This loop will run until we have finished the current page
while True:
m = self.linkR.search(text, pos = curpos)
if not m:
break
# Make sure that next time around we will not find this same hit.
curpos = m.start() + 1
titleWithSection = m.group('titleWithSection')
if not self.site.isInterwikiLink(titleWithSection):
# The link looks like this:
# link_texttrailing_chars
# We only work on namespace 0 because pipes and linktrails work
# differently for images and categories.
page = wikipedia.Page(self.site, titleWithSection)
if page.namespace() == 0:
# Replace underlines by spaces, also multiple underlines
titleWithSection = re.sub('_+', ' ', titleWithSection)
# Remove double spaces
titleWithSection = re.sub(' +', ' ', titleWithSection)
# Convert URL-encoded characters to unicode
titleWithSection = wikipedia.url2unicode(titleWithSection, site = self.site)
label = m.group('label') or titleWithSection
trailingChars = m.group('linktrail')
if trailingChars:
label += trailingChars
if titleWithSection == label:
newLink = "%s" % titleWithSection
# Check if we can create a link with trailing characters instead of a pipelink
elif len(titleWithSection) <= len(label) and label[:len(titleWithSection)] == titleWithSection and re.sub(trailR, , label[len(titleWithSection):]) == :
newLink = "%s%s" % (label[:len(titleWithSection)], label[len(titleWithSection):])
else:
# Try to capitalize the first letter of the title.
# Maybe this feature is not useful for languages that
# don't capitalize nouns...
#if not self.site.nocapitalize:
if self.site.sitename() == 'wikipedia:de':
titleWithSection = titleWithSection[0].upper() + titleWithSection[1:]
newLink = "%s" % (titleWithSection, label)
text = text[:m.start()] + newLink + text[m.end():]
return text
def resolveHtmlEntities(self, text):
ignore = [
38, # Ampersand (&)
60, # Less than (<)
62, # Less than (<)
160, # Non-breaking space ( ) - not supported by Firefox textareas
]
text = wikipedia.html2unicode(text, ignore = ignore)
return text
def validXhtml(self, text):
text = wikipedia.replaceExceptNowikiAndComments(text, r'
', r'
')
return text
def removeUselessSpaces(self, text):
result = []
multipleSpacesR = re.compile(' +')
spaceAtLineEndR = re.compile(' $')
preR = re.compile('<pre', re.IGNORECASE)
lines = text.split('\n')
for line in lines:
if len(line) > 0 and line[0] != ' ' and not preR.search(line):
line = wikipedia.replaceExceptMathNowikiAndComments(line, multipleSpacesR, ' ')
line = wikipedia.replaceExceptMathNowikiAndComments(line, spaceAtLineEndR, )
result.append(line)
return '\n'.join(result)
def cleanUpSectionHeaders(self, text):
for level in range(1, 7):
equals = '=' * level
text = wikipedia.replaceExceptNowikiAndComments(text, r'\n' + equals + ' *(?P<title>[^=]+?) *' + equals + ' *\r\n', r'\n' + equals + ' \g<title> ' + equals + '\r\n')
return text
def removeDeprecatedTemplates(self, text):
if deprecatedTemplates.has_key(self.site.family.name) and deprecatedTemplates[self.site.family.name].has_key(self.site.lang):
for template in deprecatedTemplates[self.site.family.name][self.site.lang]:
if not self.site.nocapitalize:
template = '[' + template[0].upper() + template[0].lower() + ']' + template[1:]
text = wikipedia.replaceExceptNowikiAndComments(text, r'\{\{([mM][sS][gG]:)?' + template + '(?P<parameters>\|[^}]+|)}}', )
return text
class CosmeticChangesBot:
def __init__(self, generator, acceptall = False):
self.generator = generator
self.acceptall = acceptall
# Load default summary message.
wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg_standalone))
def run(self):
for page in self.generator:
try:
ccToolkit = CosmeticChangesToolkit(page.site(), debug = True)
changedText = ccToolkit.change(page.get())
if changedText != page.get():
if not self.acceptall:
choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
if choice in ['a', 'A']:
self.acceptall = True
if self.acceptall or choice in ['y', 'Y']:
page.put(changedText)
except wikipedia.NoPage:
print "Page %s does not exist?!" % page.aslink()
except wikipedia.IsRedirectPage:
print "Page %s is a redirect; skipping." % page.aslink()
except wikipedia.LockedPage:
print "Page %s is locked?!" % page.aslink()
def main():
#page generator
gen = None
pageTitle = []
for arg in wikipedia.handleArgs():
if arg.startswith('-start:'):
gen = pagegenerators.AllpagesPageGenerator(arg[7:])
elif arg.startswith('-ref:'):
referredPage = wikipedia.Page(wikipedia.getSite(), arg[5:])
gen = pagegenerators.ReferringPageGenerator(referredPage)
elif arg.startswith('-links:'):
linkingPage = wikipedia.Page(wikipedia.getSite(), arg[7:])
gen = pagegenerators.LinkedPageGenerator(linkingPage)
elif arg.startswith('-file:'):
gen = pagegenerators.TextfilePageGenerator(arg[6:])
elif arg.startswith('-cat:'):
cat = catlib.Category(wikipedia.getSite(), arg[5:])
gen = pagegenerators.CategorizedPageGenerator(cat)
elif arg == '-always':
acceptall = True
else:
pageTitle.append(arg)
if pageTitle:
page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
gen = iter([page])
if not gen:
wikipedia.showHelp()
else:
preloadingGen = pagegenerators.PreloadingGenerator(gen)
bot = CosmeticChangesBot(preloadingGen, acceptall)
bot.run()
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()