1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
"""reads a set of .po or .pot files to produce a pootle-terminology.pot

See: http://translate.sourceforge.net/wiki/toolkit/poterminology for examples and
usage instructions
"""
25 import os
26 import re
27 import sys
28 import logging
29
30 from translate.lang import factory as lang_factory
31 from translate.misc import optrecurse
32 from translate.storage import po
33 from translate.storage import factory
34 from translate.misc import file_discovery
35
36
def create_termunit(term, unit, targets, locations, sourcenotes, transnotes, filecounts):
    """Build a single PO terminology unit for *term*.

    :param term: the term text (becomes the unit source)
    :param unit: a representative source unit to merge metadata from, or None
    :param targets: dict mapping candidate translations -> list of filenames
    :param locations: iterable of location strings to attach
    :param sourcenotes: iterable of developer notes
    :param transnotes: iterable of translator notes
    :param filecounts: dict mapping filename -> occurrence count
    :return: a new :class:`po.pounit` carrying the merged information
    """
    termunit = po.pounit(term)
    if unit is not None:
        termunit.merge(unit, overwrite=False, comments=False)
    # more than one distinct translation: record all candidates
    if len(targets) > 1:
        txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                         for target, files in targets.items()])
        if termunit.target.find('};') < 0:
            # no candidate list stored yet: use it as the (fuzzy) target
            termunit.target = txt
            termunit.markfuzzy()
        else:
            # target already holds a candidate list: keep it as a note instead
            termunit.addnote(txt, "translator")
    for location in locations:
        termunit.addlocation(location)
    for sourcenote in sourcenotes:
        termunit.addnote(sourcenote, "developer")
    for transnote in transnotes:
        termunit.addnote(transnote, "translator")
    for filename, count in filecounts.items():
        termunit.addnote("(poterminology) %s (%d)\n" % (filename, count), 'translator')
    return termunit
59
60
class TerminologyExtractor(object):
    """Extracts candidate terminology from translation units.

    Builds ``self.glossary``, a dict mapping each candidate term to the list
    of ``(source, target, unit, filename)`` tuples it was extracted from,
    then thresholds and filters it into PO terminology units.
    """

    def __init__(self, foldtitle=True, ignorecase=False, accelchars="",
                 termlength=3, sourcelanguage="en", invert=False, stopfile=None):
        self.foldtitle = foldtitle
        self.ignorecase = ignorecase
        self.accelchars = accelchars
        self.termlength = termlength

        self.sourcelanguage = sourcelanguage
        self.invert = invert

        # stopword state, populated by parse_stopword_file()
        self.stopwords = {}
        self.stoprelist = []
        self.stopfoldtitle = True
        self.stopignorecase = False

        if stopfile is None:
            try:
                stopfile = file_discovery.get_abs_data_filename('stoplist-%s' % self.sourcelanguage)
            except Exception:
                # best effort: no bundled stoplist for this language
                pass
        self.stopfile = stopfile
        self.parse_stopword_file()

        # matches printf-style placeholders (c-format / python-format)
        self.formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
        # matches XML/HTML tags and declarations
        self.xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
        # matches XML/HTML character/entity references
        self.xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.-:]*);",
                                    flags=re.UNICODE | re.IGNORECASE)

        self.units = 0
        self.glossary = {}

    def parse_stopword_file(self):
        """Parses ``self.stopfile`` into the stopword state.

        Line prefixes: ``#`` comment, blank skipped, ``!`` case-mapping
        directive (C/F/I), ``/`` regex stopword, otherwise one of the
        ``actions`` keys selecting word/phrase/skip treatment.
        """
        actions = {'+': frozenset(), ':': frozenset(['skip']),
                   '<': frozenset(['phrase']), '=': frozenset(['word']),
                   '>': frozenset(['word', 'skip']),
                   '@': frozenset(['word', 'phrase'])}

        stopfile = open(self.stopfile, "r")
        line = 0
        try:
            for stopline in stopfile:
                line += 1
                stoptype = stopline[0]
                if stoptype == '#' or stoptype == "\n":
                    continue
                elif stoptype == '!':
                    if stopline[1] == 'C':
                        self.stopfoldtitle = False
                        self.stopignorecase = False
                    elif stopline[1] == 'F':
                        self.stopfoldtitle = True
                        self.stopignorecase = False
                    elif stopline[1] == 'I':
                        self.stopignorecase = True
                    else:
                        # fix: pass format args separately, not as one tuple
                        logging.warning("%s line %d - bad case mapping directive",
                                        self.stopfile, line)
                elif stoptype == '/':
                    self.stoprelist.append(re.compile(stopline[1:-1] + '$'))
                else:
                    self.stopwords[stopline[1:-1]] = actions[stoptype]
        except KeyError as character:
            # fix: include the offending prefix character in the message
            logging.warning("%s line %d - bad stopword entry starts with '%s'",
                            self.stopfile, line, character)
            logging.warning("%s line %d all lines after error ignored",
                            self.stopfile, line + 1)
        finally:
            # fix: close the file even if parsing raises
            stopfile.close()

    def clean(self, string):
        """returns the cleaned string that contains the text to be matched"""
        for accelerator in self.accelchars:
            string = string.replace(accelerator, "")
        string = self.formatpat.sub(" ", string)
        string = self.xmlelpat.sub(" ", string)
        string = self.xmlentpat.sub(" ", string)
        string = string.strip()
        return string

    def stopmap(self, word):
        """return case-mapped stopword for input word"""
        if self.stopignorecase or (self.stopfoldtitle and word.istitle()):
            word = word.lower()
        return word

    def stopword(self, word, defaultset=frozenset()):
        """return stoplist frozenset for input word"""
        return self.stopwords.get(self.stopmap(word), defaultset)

    def addphrases(self, words, skips, translation, partials=True):
        """adds (sub)phrases with non-skipwords and more than one word"""
        # full phrase: must not start or end on a skip word
        if (len(words) > skips + 1 and
                'skip' not in self.stopword(words[0]) and
                'skip' not in self.stopword(words[-1])):
            self.glossary.setdefault(' '.join(words), []).append(translation)
        if partials:
            part = list(words)
            while len(part) > 2:
                if 'skip' in self.stopword(part.pop()):
                    skips -= 1
                if (len(part) > skips + 1 and
                        'skip' not in self.stopword(part[0]) and
                        'skip' not in self.stopword(part[-1])):
                    self.glossary.setdefault(' '.join(part), []).append(translation)

    def processunits(self, units, fullinputpath):
        """Scans *units*, adding word and phrase candidates to the glossary."""
        sourcelang = lang_factory.getlanguage(self.sourcelanguage)
        rematchignore = frozenset(('word', 'phrase'))
        defaultignore = frozenset()
        for unit in units:
            self.units += 1
            if unit.isheader():
                continue
            if not self.invert:
                source = self.clean(unit.source)
                target = self.clean(unit.target)
            else:
                target = self.clean(unit.source)
                source = self.clean(unit.target)
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                words = []
                skips = 0
                for word in sourcelang.words(sentence):
                    stword = self.stopmap(word)
                    if self.ignorecase or (self.foldtitle and word.istitle()):
                        word = word.lower()
                    ignore = defaultignore
                    if stword in self.stopwords:
                        ignore = self.stopwords[stword]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(stword) is not None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # reduce plural to singular-form when the root is known
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if self.termlength > 1:
                        if 'phrase' in ignore:
                            # add trailing phrases in previous words, then reset
                            while len(words) > 2:
                                if 'skip' in self.stopword(words.pop(0)):
                                    skips -= 1
                                self.addphrases(words, skips, translation)
                            words = []
                            skips = 0
                        else:
                            words.append(word)
                            if 'skip' in ignore:
                                skips += 1
                            if len(words) > self.termlength + skips:
                                while len(words) > self.termlength + skips:
                                    if 'skip' in self.stopword(words.pop(0)):
                                        skips -= 1
                                self.addphrases(words, skips, translation)
                            else:
                                self.addphrases(words, skips, translation, partials=False)
                # NOTE(review): placement inferred from the per-sentence
                # words/skips scope — flush trailing phrases at sentence end
                if self.termlength > 1:
                    while self.termlength > 1 and len(words) > 2:
                        if 'skip' in self.stopword(words.pop(0)):
                            skips -= 1
                        self.addphrases(words, skips, translation)

    def extract_terms(self, inputmin=1, fullmsgmin=1, substrmin=2, locmin=2):
        """Thresholds the glossary and converts entries into term units.

        :return: dict mapping term -> (score, termunit)
        """
        terms = {}
        locre = re.compile(r":[0-9]+$")
        sys.stderr.write("%d terms from %d units\n" %
                         (len(self.glossary), self.units))
        for term, translations in self.glossary.items():
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = set()
            locations = set()
            sourcenotes = set()
            transnotes = set()
            targets = {}
            fullmsg = False
            bestunit = None
            for source, target, unit, filename in translations:
                sources.add(source)
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                # a unit whose whole (cleaned) message is the term is "best"
                if term.lower() == self.clean(unit.source).lower():
                    fullmsg = True
                    target = self.clean(unit.target)
                    if self.ignorecase or (self.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.target = target
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes.add(unit.getnotes("source code"))
                        transnotes.add(unit.getnotes("translator"))
                    unit.source = term
                    bestunit = unit
                # strip trailing :linenumber so locations count source files
                for loc in unit.getlocations():
                    locations.add(locre.sub("", loc))

            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < inputmin or 0 < numlocs < locmin:
                continue
            if fullmsg:
                if numsources < fullmsgmin:
                    continue
            elif numsources < substrmin:
                continue

            # cap the number of reported locations
            locmax = 2 * locmin
            if numlocs > locmax:
                # NOTE(review): set iteration order is arbitrary, so the
                # retained subset of locations is unspecified (as before)
                locations = list(locations)[0:locmax]
                locations.append("(poterminology) %d more locations"
                                 % (numlocs - locmax))

            termunit = create_termunit(term, bestunit, targets, locations,
                                       sourcenotes, transnotes, filecounts)
            terms[term] = ((10 * numfiles) + numsources, termunit)
        return terms

    def filter_terms(self, terms, nonstopmin=1, sortorders=("frequency", "dictionary", "length")):
        """reduce subphrases from extracted terms"""
        # shortest terms first, so subphrases are visited before superphrases
        termlist = sorted(terms.keys(), key=len)
        sys.stderr.write("%d terms after thresholding\n" % len(termlist))
        for term in termlist:
            words = term.split()
            nonstop = [word for word in words if not self.stopword(word)]
            if len(nonstop) < nonstopmin and len(nonstop) != len(words):
                del terms[term]
                continue
            if len(words) <= 2:
                continue
            # drop any prefix/suffix subphrase with exactly the same score
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        sys.stderr.write("%d terms after subphrase reduction\n" % len(terms))
        termitems = list(terms.values())
        # fix: copy before popping so the default (and any caller list) is
        # not mutated; orders are applied in reverse so the first listed
        # order becomes the primary (last, stable) sort key
        sortorders = list(sortorders)
        while len(sortorders) > 0:
            order = sortorders.pop()
            if order == "frequency":
                termitems.sort(key=lambda item: item[0], reverse=True)
            elif order == "dictionary":
                termitems.sort(key=lambda item: item[1].source.lower())
            elif order == "length":
                termitems.sort(key=lambda item: len(item[1].source))
            else:
                logging.warning("unknown sort order %s", order)
        return termitems
331
class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # without -i/-o/-u, interpret free-standing args as inputs (+ output)
        if args and not options.input:
            if not options.output and not options.update and len(args) > 1:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        # a remaining free-standing arg is the output file, unless it would
        # silently overwrite an existing non-directory
        if args and not options.output and not options.update:
            if os.path.lexists(args[-1]) and not os.path.isdir(args[-1]):
                self.error("To overwrite %s, specify it with -o/--output or -u/--update" % (args[-1]))
            options.output = args[-1]
            args = args[:-1]
        if options.output and options.update:
            self.error("You cannot use both -u/--update and -o/--output")
        if args:
            self.error("You have used an invalid combination of -i/--input, -o/--output, -u/--update and freestanding args")
        if not options.input:
            self.error("No input file or directory was specified")
        # single input => terms need only appear in one file by default
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
            if options.inputmin is None:
                options.inputmin = 1
        elif not isinstance(options.input, list) and not os.path.isdir(options.input):
            if options.inputmin is None:
                options.inputmin = 1
        elif options.inputmin is None:
            options.inputmin = 2
        if options.update:
            # updating: the update file is both an extra input and the output
            options.output = options.update
            if isinstance(options.input, list):
                options.input.append(options.update)
            elif options.input:
                options.input = [options.input, options.update]
            else:
                options.input = options.update
        if not options.output:
            options.output = "pootle-terminology.pot"
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
                "\n input directory is searched for PO files, terminology PO file is output file"
        else:
            super(TerminologyOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options

        NOTE(review): this method was missing from the reviewed copy;
        reconstructed from its use in main() and recursiveprocess() —
        confirm against upstream before relying on it.
        """
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        self.files = 0
        self.extractor = TerminologyExtractor(foldtitle=options.foldtitle,
                                              ignorecase=options.ignorecase,
                                              accelchars=options.accelchars,
                                              termlength=options.termlength,
                                              sourcelanguage=options.sourcelanguage,
                                              invert=options.invert,
                                              stopfile=options.stopfile)
        self.recursiveprocess(options)

    def recursiveprocess(self, options):
        """recurse through directories and process files"""
        if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
            if isinstance(options.input, list):
                inputfiles = self.recurseinputfilelist(options)
            else:
                inputfiles = self.recurseinputfiles(options)
        else:
            if options.input:
                inputfiles = [os.path.basename(options.input)]
                options.input = os.path.dirname(options.input)
            else:
                inputfiles = [options.input]
        if os.path.isdir(options.output):
            options.output = os.path.join(options.output, "pootle-terminology.pot")

        self.initprogressbar(inputfiles, options)
        for inputpath in inputfiles:
            self.files += 1
            fullinputpath = self.getfullinputpath(options, inputpath)
            success = True
            try:
                self.processfile(None, options, fullinputpath)
            except Exception as error:
                # NOTE(review): legacy guard — KeyboardInterrupt is not a
                # subclass of Exception, so this branch is unreachable; kept
                # for fidelity with the original
                if isinstance(error, KeyboardInterrupt):
                    raise
                self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
                success = False
            self.reportprogress(inputpath, success)
        del self.progressbar
        self.outputterminology(options)

    def processfile(self, fileprocessor, options, fullinputpath):
        """process an individual file

        NOTE(review): the body was missing from the reviewed copy;
        reconstructed from the call in recursiveprocess() and the
        extractor API — confirm against upstream.
        """
        inputfile = self.openinputfile(options, fullinputpath)
        inputfile = factory.getobject(inputfile)
        self.extractor.processunits(inputfile.units, fullinputpath)

    def outputterminology(self, options):
        """saves the generated terminology glossary"""
        termfile = po.pofile()
        sys.stderr.write("scanned %d files\n" % self.files)
        terms = self.extractor.extract_terms(inputmin=options.inputmin, fullmsgmin=options.fullmsgmin,
                                             substrmin=options.substrmin, locmin=options.locmin)
        termitems = self.extractor.filter_terms(terms, nonstopmin=options.nonstopmin, sortorders=options.sortorders)
        for count, unit in termitems:
            termfile.units.append(unit)
        # fix: close the output file deterministically instead of leaking
        # the handle to the garbage collector
        with open(options.output, "w") as outputfile:
            outputfile.write(str(termfile))
450
def fold_case_option(option, opt_str, value, parser):
    """optparse callback for -F: fold "Title Case" terms to lowercase."""
    parser.values.ignorecase = False
    parser.values.foldtitle = True
455
def preserve_case_option(option, opt_str, value, parser):
    """optparse callback for -C: keep all uppercase/lowercase as-is."""
    parser.values.ignorecase = parser.values.foldtitle = False
459
def main():
    """Command-line entry point: builds the terminology option parser and runs it."""
    formats = {"po": ("po", None), "pot": ("pot", None), None: ("po", None)}
    parser = TerminologyOptionParser(formats)

    parser.add_option("-u", "--update", type="string", dest="update",
                      metavar="UPDATEFILE", help="update terminology in UPDATEFILE")

    parser.add_option("-S", "--stopword-list", type="string", metavar="STOPFILE", dest="stopfile",
                      help="read stopword (term exclusion) list from STOPFILE (default %s)" %
                      file_discovery.get_abs_data_filename('stoplist-en'))

    parser.set_defaults(foldtitle=True, ignorecase=False)
    parser.add_option("-F", "--fold-titlecase", callback=fold_case_option,
                      action="callback", help="fold \"Title Case\" to lowercase (default)")
    parser.add_option("-C", "--preserve-case", callback=preserve_case_option,
                      action="callback", help="preserve all uppercase/lowercase")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", help="make all terms lowercase")

    parser.add_option("", "--accelerator", dest="accelchars", default="",
                      metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")

    # fix: defaults for type="int" options were strings ("3", "1", ...);
    # optparse does not convert default values, so downstream arithmetic
    # (e.g. termlength + skips) would fail or misbehave — use real ints
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default=3,
                      help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option("", "--nonstop-needed", type="int", dest="nonstopmin", default=1,
                      help="omit terms with less than MIN nonstop words (default 1)", metavar="MIN")
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
                      help="omit terms appearing in less than MIN input files (default 2, or 1 if only one input file)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default=1,
                      help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default=2,
                      help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default=2,
                      help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")

    sortorders_default = ["frequency", "dictionary", "length"]
    # NOTE(review): action="append" with a list default appends user-given
    # orders onto the default list (known optparse gotcha) — kept as-is
    parser.add_option("", "--sort", dest="sortorders", action="append",
                      type="choice", choices=sortorders_default, metavar="ORDER", default=sortorders_default,
                      help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(sortorders_default))

    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
                      help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
                      action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()
508
509
# run the terminology extractor when executed as a script
if __name__ == '__main__':
    main()
512