grassyknoll.backend.lucene.Smarts

1 """a higher level L{lucene} wrapper with intelligence""" 2 import os.path 3 import logging 4 import datetime 5 from itertools import imap 6 7 from lucene import (Field, Document, Hit, Term, IndexWriter, IndexReader, 8 IndexSearcher, Query, FSDirectory, RAMDirectory, Document, StandardAnalyzer, 9 QueryParser) 10 11 from HitsWrapper import HitsWrapper 12 import TermIterator 13 from LuceneConstants import Constants 14 import conversion 15 16 17 __all__=['SmartDoc', 'SmartField', 'SmartHit', 'SmartStorage', 'Constants'] 18 19 ## dicts mapping python types to lucene-friendly strings 20 str2type=dict((t.__name__, t) for t in (int, long, float, datetime.date, datetime.datetime, datetime.time)) 21 type2str=dict((t, n) for n, t in str2type.iteritems()) 22 23

class SmartField(object):
    """One value destined for the index, plus instructions on how to save it.

    XXX need to add support for value lists.  Lucene supports multiple fields
    of the same name in a single doc, shouldn't be too hard.

    See
    U{Lucene documentation<http://lucene.apache.org/java/docs/api/org/apache/lucene/document/Field.html>}
    and nested classes for documentation on C{store, index & termvector}.
    Each of them may be given either as an attribute of L{Constants} or as
    the corresponding string/bool value; the constructor normalises them with
    C{Constants.toConstant}.

    @ivar name: name of the L{Field}.  The constructor does not set it;
        L{SmartDoc} assigns it from the key the field is stored under.
    @type name: String

    @ivar value: value of the L{Field}.  A C{str} value is converted to
        C{unicode} by the constructor.
    @type value: basic python types

    @ivar alltext: should the value be included in a default field search.
        Always False for values that are not strings.
    @type alltext: Boolean

    @ivar store: how to store the document.  One of C{yes, no, compress}
    @type store: string, bool or Lucene constant

    @ivar index: how to index the document.
        One of C{no, tokenized, untokenized, no_norms}
    @type index: string, bool or Lucene constant

    @ivar termvector: what sort of term vectors to generate for the document.
        One of C{yes, no, with_offsets, with_positions, with_positions_and_offsets}
    @type termvector: string, bool or Lucene constant
    """

    __slots__ = ['name', 'value', 'store', 'index', 'termvector', 'alltext']

    ## fallbacks used when the caller passes None.  C{index} has no fixed
    ## fallback: it is picked from the type of the value.
    _default_store = Constants.STORE_NO
    _default_termvector = Constants.TERMVECTOR_NO
    _default_alltext = True

    def __init__(self, value,
                 store=None, index=None, termvector=None, alltext=None):
        """
        @arg value: the value to index
        @arg store: storage instruction, or None for the class default
        @arg index: indexing instruction, or None to tokenize unicode values
            and leave everything else untokenized
        @arg termvector: term vector instruction, or None for the class
            default
        @arg alltext: include the value in the default search field, or None
            for the class default
        """
        ## byte strings are always handled as unicode from here on
        if isinstance(value, str):
            value = unicode(value)
        self.value = value

        ## store
        chosen_store = store
        if chosen_store is None:
            chosen_store = self._default_store
        self.store = Constants.toConstant(chosen_store, "store")

        ## termvector
        chosen_termvector = termvector
        if chosen_termvector is None:
            chosen_termvector = self._default_termvector
        self.termvector = Constants.toConstant(chosen_termvector, "termvector")

        ## index: tokenize string like things by default
        chosen_index = index
        if chosen_index is None:
            if isinstance(value, unicode):
                chosen_index = Constants.INDEX_TOKENIZED
            else:
                chosen_index = Constants.INDEX_UNTOKENIZED
        self.index = Constants.toConstant(chosen_index, "index")

        ## alltext
        if alltext is None:
            alltext = self._default_alltext
        assert isinstance(alltext, bool)

        ## it doesn't make sense to try to throw non-text fields into alltext
        if not isinstance(value, basestring):
            self.alltext = False
        else:
            self.alltext = alltext

    def toField(self):
        """
        @returns: a Lucene internal representation of this SmartField
        @rtype: L{Field}
        """
        field_name = self.name
        lucene_value = conversion.toLucene(self.value)
        return Field(field_name, lucene_value,
                     self.store, self.index, self.termvector)

114

class SmartDoc(object):
    """A document assembled from named L{SmartField}s.

    @ivar fields: fields in this document, keyed by field name
    @type fields: dict of L{SmartField}
    """

    __slots__ = ['fields']

    ## how should we store termvectors for the __alltext__ field
    _alltext_termvector = Constants.TERMVECTOR_NO

    def __init__(self, fields):
        """
        @arg fields: map of field name => field.  Every field has its
            C{name} attribute set to the key it is stored under.
        @type fields: dict of L{SmartField}
        """
        assert isinstance(fields, dict)
        for key, member in fields.iteritems():
            assert isinstance(member, SmartField)
            ## the field learns its own name from the dict key
            member.name = key
        self.fields = fields

    def toDocument(self):
        """
        Besides one Lucene field per L{SmartField}, the result may carry two
        synthetic fields: C{__alltext__}, the newline-joined values of all
        fields flagged C{alltext}, and C{__typemap__}, a repr()'d dict
        recording the python type name of every non-string value.

        @returns: a Lucene internal representation of this SmartDoc.
        @rtype: L{Document}
        """
        lucene_doc = Document()

        ## values that go into the default __alltext__ search field
        searchable = []

        ## field name => python type name, for non-string values only
        type_names = {}

        for member in self.fields.itervalues():
            assert isinstance(member, SmartField)
            lucene_doc.add(member.toField())
            if member.alltext:
                searchable.append(member.value)
            if not isinstance(member.value, basestring):
                type_names[member.name] = type2str[type(member.value)]

        if searchable:
            ## the name should match the default field used by
            ## SmartStorage.search
            catch_all = SmartField(u"\n".join(searchable),
                                   store=Constants.STORE_NO,
                                   index=Constants.INDEX_TOKENIZED,
                                   termvector=self._alltext_termvector)
            catch_all.name = "__alltext__"
            lucene_doc.add(catch_all.toField())

        if type_names:
            ## repr() is enough to serialize the map: it is purely a dict of
            ## strings at this point
            type_record = SmartField(unicode(repr(type_names)),
                                     store=Constants.STORE_YES,
                                     index=Constants.INDEX_NO)
            type_record.name = "__typemap__"
            lucene_doc.add(type_record.toField())

        return lucene_doc

176

class SmartHit(object):
    """a search result

    Attributes are expensive to calculate and often unneeded if skipping
    hits, so we do it lazily.
    """

    __slots__ = ['_hit', '_fields', '_score']

    def __init__(self, hit):
        """
        @arg hit: the raw search hit to wrap
        @type hit: L{Hit}
        """
        self._hit = Hit.cast_(hit)

    @property
    def score(self):
        """relevance score, fetched from the hit on first access

        @type: float
        """
        try:
            return self._score
        except AttributeError:
            self._score = self._hit.getScore()
            return self._score

    def fields(self, fields=None):
        """
        map of field name=>value

        Only the complete map (C{fields=None}) is cached.  A filtered request
        is answered from that cache when it exists and is otherwise read from
        the hit without being cached, so a filtered call can never change
        what a later call returns.

        @arg fields: names of the fields to return, or None for all of them.
            Names the document has no field for are left out of the result.
        @type fields: iterable of strings, or None

        @rtype: dict
        """
        try:
            complete = self._fields
        except AttributeError:
            complete = None

        if fields is None:
            if complete is None:
                complete = self._fields = self._loadFields(None)
            return complete

        if complete is not None:
            ## everything is already converted; just pick the wanted names
            return dict((name, complete[name])
                        for name in fields if name in complete)

        return self._loadFields(fields)

    def _loadFields(self, names):
        """read and convert field values from the underlying hit

        @arg names: field names to read, or None for every field
        @returns: dict of field name => converted value, without the
            internal C{__typemap__} field
        """
        ## typemap will be field.name() => python type
        raw_typemap = self._hit.get('__typemap__')
        if raw_typemap is not None:
            ## raw_typemap is a repr()'d dict of strings.  eval() and
            ## convert back to the type()
            ## NOTE(review): eval() runs whatever is stored in the index;
            ## only safe while every index writer is trusted.
            typemap = dict((n, str2type[t])
                           for n, t in eval(raw_typemap).iteritems())
        else:
            typemap = {}

        ## need to retrieve the doc to get a list of fields
        doc = self._hit.getDocument()

        if names is None:
            fieldobjs = imap(Field.cast_, doc.getFields())
        else:
            ## skip names the document has no field for instead of failing
            ## on None.name() below
            fieldobjs = (f for f in (doc.getField(n) for n in names)
                         if f is not None)

        result = {}
        for f in fieldobjs:
            ## f.name() is expensive.  do it once
            name = f.name()
            if name == '__typemap__':
                continue
            value = f.stringValue()
            if typemap:
                ## fields without a typemap entry are treated as unicode
                value = conversion.fromLucene(value,
                                              typemap.get(name, unicode))
            result[name] = value
        return result

240

class SmartStorage(object):
    """A wrapper around a lucene index with a smarter API

    @ivar analyzer: a Lucene analyzer
    @type analyzer: L{lucene.Analyzer}

    @ivar directory: the lucene Directory.  An L{FSDirectory} when an
        C{index_dir} was given, a L{RAMDirectory} otherwise.
    @type directory: L{FSDirectory} or L{RAMDirectory}

    @ivar writer: the writer used for inserts, deletes and optimization
    @type writer: L{IndexWriter}

    @ivar enable_bulk_writes: if true, configure writer optimized for bulk
        loading.  If false, configure for updates.  Defaults to False.  The
        flag is only read by L{configureWriter}, so call that again after
        changing it.
    @type enable_bulk_writes: bool
    """

    logger = logging.getLogger("SmartStorage")

    def __init__(self, index_dir=None, create=False):
        """
        @arg index_dir: directory of the on-disk index, or None for an
            in-memory index (which is always created fresh)
        @type index_dir: string or None

        @arg create: create the index if it doesn't exist
        @type create: bool

        @raise AssertionError: if C{create} is true but an index already
            exists in C{index_dir}, or C{create} is false and none exists
        """

        if index_dir is not None:
            segments_file_exists = os.path.exists(
                os.path.join(index_dir, 'segments.gen'))
            if create:
                ## raised explicitly rather than asserted: the directory is
                ## opened with create=True below, and this guard must still
                ## hold when asserts are stripped by python -O
                if segments_file_exists:
                    raise AssertionError(
                        "lucene index exists: %s" % index_dir)
                ## an existing directory without an index is fine
                if not os.path.isdir(index_dir):
                    os.makedirs(index_dir)
                self.logger.warning("Created %s", index_dir)
            else:
                if not segments_file_exists:
                    raise AssertionError(
                        "no lucene index found: %s" % index_dir)
                self.logger.info("Opened %s", index_dir)

        # XXX should provide a way to pass these in to constructor
        self.analyzer = StandardAnalyzer()

        ## FSDirectory is faster than MMapDirectory, per some random email
        ## from google:
        ## http://mail-archives.apache.org/mod_mbox/lucene-java-user/200510.mbox/<200510110916.03843.paul.elschot%40xs4all.nl>
        ## ah, science
        ## XXX it's got concurrency issues tho
        if index_dir is None:
            self.directory = RAMDirectory()
            create = True
        else:
            self.directory = FSDirectory.getDirectory(index_dir, create)

        self.writer = IndexWriter(self.directory, self.analyzer, create)
        self._searcher = IndexSearcher(self.directory)
        self.enable_bulk_writes = False
        self.configureWriter(self.writer)

    @property
    def index_dir(self):
        """@ivar index_dir: where the lucene index lives, or None when the
        directory is not an L{FSDirectory}"""
        if FSDirectory.instance_(self.directory):
            return FSDirectory.cast_(self.directory).getFile().getCanonicalPath()
        return None

    def close(self):
        """close all resources

        The writer and the directory are closed even if closing an earlier
        resource raises; the first exception still propagates.
        """
        ## look the path up before anything is closed
        index_dir = self.index_dir
        try:
            self._searcher.close()
        finally:
            try:
                self.writer.close()
            finally:
                self.directory.close()
        self.logger.info("Closed %s", index_dir)

    @property
    def searcher(self):
        """return fresh L{IndexSearcher}"""
        # XXX this is not thread safe; it would be if objects called close in their destructor
        # this check is faster than always opening a new Searcher due to caching
        ## flush pending writes first so isCurrent() can see them
        self.writer.flush()
        if not self._searcher.getIndexReader().isCurrent():
            self._searcher.close()
            self._searcher = IndexSearcher(self.directory)
        return self._searcher

    @property
    def reader(self):
        """@type L{IndexReader}"""
        return self.searcher.getIndexReader()

    def optimize(self):
        """Optimize the index."""
        self.writer.optimize()

    def configureWriter(self, writer):
        """configure an L{IndexWriter}.

        Applies the settings shared by all writers, then the bulk or update
        specific ones depending on C{self.enable_bulk_writes}.

        @arg writer: the new IndexWriter
        @type writer: L{IndexWriter}
        """
        ## optimize for reading, not writing: more frequent merges than
        ## default of 10.  2 comes from Doug Cutting:
        ## http://www.opensubscriber.com/message/lucene-user%40jakarta.apache.org/803308.html
        writer.setMergeFactor(2)

        ## Multifile, not compound file, is the default.  Lucene in Action
        ## B.3.2 claims that compound files are 5-10% slower for writing,
        ## and <PyLucene>/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py,
        ## run as part of the test suite, actually asserts that
        ## compound time > multifile time (self.assert_(cTiming > mTiming))
        ## Note that Lucene will convert existing files from compound
        ## to multifile, or vice versa, at open time.
        ##
        ## Use compound files anyway, to avoid opening zillions of files
        ## hitting a bug in Python (and select limits, if we were using
        ## select).
        writer.setUseCompoundFile(True)

        if self.enable_bulk_writes:
            self.configureBulkWriter(writer)
        else:
            self.configureUpdatingWriter(writer)

    @staticmethod
    def configureUpdatingWriter(writer):
        """configure an L{IndexWriter} for updates.

        @arg writer: the new IndexWriter
        @type writer: L{IndexWriter}
        """

        ## explicitly set merge factor to 2.  2 comes from Doug Cutting:
        ## http://www.opensubscriber.com/message/lucene-user%40jakarta.apache.org/803308.html
        writer.setMergeFactor(2)

    @staticmethod
    def configureBulkWriter(writer):
        """configure an L{IndexWriter} for bulk loading.

        @arg writer: the new IndexWriter
        @type writer: L{IndexWriter}
        """
        ## set merge factor up from default
        writer.setMergeFactor(20)

        ## set "the minimal number of documents required before the buffered
        ## in-memory documents are merg[ed] and a new Segment is created."

        ## Make sure this doesn't use too much memory: if a doc is ~1K,
        ## then buffering should be ~10M, which sounds very reasonable, but
        ## I could be wrong.  This means we'll have fairly big segments,
        ## and possibly fairly long pauses when merging.
        ##
        ## 10000 seemed to chew through RAM, so dialed it back to 200, which
        ## results in about 700M usage.
        writer.setMaxBufferedDocs(200)

    def search(self, query):
        """search the index

        @arg query: a Lucene U{query<http://lucene.apache.org/java/docs/queryparsersyntax.html>}.
            A string is parsed with C{__alltext__} as the default field.
        @type query: String or L{lucene.Query}

        @rtype: L{HitsWrapper} of L{SmartHit}s
        """
        if isinstance(query, basestring):
            # default field should match L{Smarts.SmartDoc}
            query = QueryParser("__alltext__", self.analyzer).parse(query)

        assert Query.instance_(query)
        hits = self.searcher.search(query)
        return HitsWrapper(hits, SmartHit)

    def insert(self, doc):
        """insert a document

        @arg doc: the document to insert
        @type doc: L{SmartDoc}
        """
        assert isinstance(doc, SmartDoc)
        self.writer.addDocument(doc.toDocument())

    def delete(self, name_or_term, value=None):
        """delete documents from lucene index matching Term

        @arg name_or_term: field name or Term to delete
        @type name_or_term: string or L{Term}

        @arg value: field value, or None if passing a L{Term}
        @type value: unicode or None

        @returns: whatever the writer's C{deleteDocuments} returns
        """
        if Term.instance_(name_or_term):
            assert value is None
            term = name_or_term
        else:
            assert isinstance(name_or_term, basestring)
            assert isinstance(value, basestring)
            term = Term(name_or_term, value)
        ## deleting goes through the writer, which is possible as of
        ## Pylucene r331.
        ## see http://lists.osafoundation.org/pipermail/pylucene-dev/2007-May/001797.html
        return self.writer.deleteDocuments(term)

    def getAllTerms(self, field, include_counts=True):
        """iterate over all terms for a given field

        @arg field: name of the field
        @type field: String

        @arg include_counts: should value frequencies be included
        @type include_counts: Boolean

        @returns: if include_counts is True, tuples of (term, count).  If
            false, just the terms
        """
        if include_counts:
            return TermIterator.termIteratorCount(self.reader, field,
                                                  count_needed=True)
        return TermIterator.termIterator(self.reader, field)

449

Source Code for Module grassyknoll.backend.lucene.Smarts