22 """Module to provide a translation memory database."""
23
24 import logging
25 import math
26 import re
27 import threading
28 import time
29 try:
30 from sqlite3 import dbapi2
31 except ImportError:
32 from pysqlite2 import dbapi2
33
34 from translate.search.lshtein import LevenshteinComparer
35 from translate.lang import data
36
37
38 STRIP_REGEXP = re.compile("\W", re.UNICODE)


class LanguageError(Exception):

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return str(self.value)


class TMDB(object):
    _tm_dbs = {}

    def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        self.db_file = db_file
        # connections to the same database file are shared between instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]

        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()

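    # sqlite connection objects cannot be shared between threads, so each
    # thread lazily opens and caches its own (connection, cursor) pair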
    def _get_connection(self, index):
        current_thread = threading.currentThread()
        if current_thread not in self._tm_db:
            connection = dbapi2.connect(self.db_file)
            cursor = connection.cursor()
            self._tm_db[current_thread] = (connection, cursor)
        return self._tm_db[current_thread][index]

    connection = property(lambda self: self._get_connection(0))
    cursor = property(lambda self: self._get_connection(1))

    def init_database(self):
        """creates database tables and indices"""

        script = """
            CREATE TABLE IF NOT EXISTS sources (
                sid INTEGER PRIMARY KEY AUTOINCREMENT,
                text VARCHAR NOT NULL,
                context VARCHAR DEFAULT NULL,
                lang VARCHAR NOT NULL,
                length INTEGER NOT NULL
            );
            CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
            CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
            CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
            CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);

            CREATE TABLE IF NOT EXISTS targets (
                tid INTEGER PRIMARY KEY AUTOINCREMENT,
                sid INTEGER NOT NULL,
                text VARCHAR NOT NULL,
                lang VARCHAR NOT NULL,
                time INTEGER DEFAULT NULL,
                FOREIGN KEY (sid) REFERENCES sources(sid)
            );
            CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
            CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
            CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
            CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
        """

        try:
            self.cursor.executescript(script)
            self.connection.commit()
        except:
            self.connection.rollback()
            raise

    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does"""

        # no reliable way to detect fts3 support except trying to create a dummy virtual table
        try:
            script = """
                DROP TABLE IF EXISTS test_for_fts3;
                CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
                DROP TABLE test_for_fts3;
            """
            self.cursor.executescript(script)
            logging.debug("fts3 supported")

            # older sqlite versions do not support CREATE VIRTUAL TABLE ... IF NOT
            # EXISTS, so check for the table manually
            self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                script = """
                    CREATE VIRTUAL TABLE fulltext USING fts3(text);
                """
                logging.debug("fulltext table does not exist, creating it")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")

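            # index any sources added while fulltext was unavailable, then keep
            # the fulltext index in sync with the sources table via triggers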
            script = """
                INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
                CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
                BEGIN
                    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
                END;
                CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
                BEGIN
                    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
                END;
                CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
                BEGIN
                    DELETE FROM fulltext WHERE docid = OLD.sid;
                END;
            """
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True

        except dbapi2.OperationalError, e:
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
                DROP TRIGGER IF EXISTS sources_insert_trig;
                DROP TRIGGER IF EXISTS sources_update_trig;
                DROP TRIGGER IF EXISTS sources_delete_trig;
            """
            self.cursor.executescript(script)

    def preload_db(self):
        """ugly hack to force caching of sqlite db file in memory for
        improved performance"""
        if self.fulltext:
            query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t ON s.sid = t.sid"""
        else:
            query = """SELECT COUNT(*) FROM sources s JOIN targets t ON s.sid = t.sid"""
        self.cursor.execute(query)
        (numrows,) = self.cursor.fetchone()
        logging.debug("tmdb has %d records", numrows)
        return numrows

    def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
        """inserts unit in the database"""
        # language codes stored on the unit take precedence over the arguments
        if unit.getsourcelanguage():
            source_lang = unit.getsourcelanguage()
        if unit.gettargetlanguage():
            target_lang = unit.gettargetlanguage()

        if not source_lang:
            raise LanguageError("undefined source language")
        if not target_lang:
            raise LanguageError("undefined target language")

        unitdict = {"source": unit.source,
                    "target": unit.target,
                    "context": unit.getcontext(),
                    }
        self.add_dict(unitdict, source_lang, target_lang, commit)

    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang,
                                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in the database, fetch its sid
                self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? AND lang=?",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang))
                sid = self.cursor.fetchone()
                (sid,) = sid
            try:
                self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                                    (sid,
                                     unit["target"],
                                     target_lang,
                                     int(time.time())))
            except dbapi2.IntegrityError:
                # identical target already stored for this source, nothing to do
                pass

            if commit:
                self.connection.commit()
        except:
            if commit:
                self.connection.rollback()
            raise

    def add_store(self, store, source_lang, target_lang, commit=True):
        """insert all translated units in store in the database"""
        count = 0
        for unit in store.units:
            if unit.istranslatable() and unit.istranslated():
                self.add_unit(unit, source_lang, target_lang, commit=False)
                count += 1
        if commit:
            self.connection.commit()
        return count

    def add_list(self, units, source_lang, target_lang, commit=True):
        """insert all units in list into the database, units are
        represented as dictionaries"""
        count = 0
        for unit in units:
            self.add_dict(unit, source_lang, target_lang, commit=False)
            count += 1
        if commit:
            self.connection.commit()
        return count

274 """return TM suggestions for unit_source"""
275 if isinstance(unit_source, str):
276 unit_source = unicode(unit_source, "utf-8")
277 if isinstance(source_langs, list):
278 source_langs = [data.normalize_code(lang) for lang in source_langs]
279 source_langs = ','.join(source_langs)
280 else:
281 source_langs = data.normalize_code(source_langs)
282 if isinstance(target_langs, list):
283 target_langs = [data.normalize_code(lang) for lang in target_langs]
284 target_langs = ','.join(target_langs)
285 else:
286 target_langs = data.normalize_code(target_langs)
287
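        # candidates whose length falls outside these bounds can never reach
        # min_similarity, so they are filtered out in SQL before the more
        # expensive Levenshtein comparison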
        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source), self.min_similarity, self.max_length)

        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = filter(lambda word: len(word) > 2, unit_words)

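        # when fts3 is available and the source has enough significant words,
        # use a fulltext OR-query to pre-select candidate rows; otherwise fall
        # back to scanning all sources within the length range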
        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
                       WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
                       AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
                       WHERE s.lang IN (?) AND t.lang IN (?)
                       AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen))

        results = []
        for row in self.cursor:
            result = {}
            result['source'] = row[0]
            result['target'] = row[1]
            result['context'] = row[2]
            result['quality'] = self.comparer.similarity(unit_source, result['source'], self.min_similarity)
            if result['quality'] >= self.min_similarity:
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", unicode(results))
        return results


def min_levenshtein_length(length, min_similarity):
    """lowest candidate length that can still reach min_similarity against a
    source of the given length"""
    return math.ceil(max(length * (min_similarity / 100.0), 2))


def max_levenshtein_length(length, min_similarity, max_length):
    """highest candidate length that can still reach min_similarity against a
    source of the given length"""
    return math.floor(min(length / (min_similarity / 100.0), max_length))
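

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the original module):
    # the unit dictionaries, language codes and the ":memory:" path below are
    # made-up example data.
    logging.basicConfig(level=logging.DEBUG)
    tm = TMDB(":memory:", max_candidates=3, min_similarity=75)
    tm.add_list([
        {"source": "Open the file", "target": "Ouvrir le fichier", "context": ""},
        {"source": "Close the file", "target": "Fermer le fichier", "context": ""},
    ], "en", "fr")
    for match in tm.translate_unit("Open the file", "en", "fr"):
        print "%(quality)d%% %(source)s -> %(target)s" % match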
331