| Home | Trees | Indices | Help |
|
|---|
|
|
1 """a higher level L{lucene} wrapper with intelligence"""
2 import os.path
3 import logging
4 import datetime
5 from itertools import imap
6
7 from lucene import (Field, Document, Hit, Term, IndexWriter, IndexReader,
8 IndexSearcher, Query, FSDirectory, RAMDirectory, Document, StandardAnalyzer,
9 QueryParser)
10
11 from HitsWrapper import HitsWrapper
12 import TermIterator
13 from LuceneConstants import Constants
14 import conversion
15
16
17 __all__=['SmartDoc', 'SmartField', 'SmartHit', 'SmartStorage', 'Constants']
18
19 ## dicts mapping python types to lucene-friendly strings
20 str2type=dict((t.__name__, t) for t in (int, long, float, datetime.date, datetime.datetime, datetime.time))
21 type2str=dict((t, n) for n, t in str2type.iteritems())
22
23
25 """a field name & value, with instructions on how to save it into the
26 index
27
28 XXX need to add support for value lists. Lucene supports multiple fields
29 of the same name in a single doc, shouldn't be too hard.
30
31 See
32 U{Lucene documentation<http://lucene.apache.org/java/docs/api/org/apache/lucene/document/Field.html>}
33 and nested classes for documentation on C{store, index & termvector}.
34 These values may either be attributes of L{Constants} or corresponding
35 string/bool values.
36
37 @ivar name: name of the L{Field}
38 @type name: String
39
40 @ivar value: value of the L{Field}. This is filled in by L{SmartDoc}
41 @type value: basic python types
42
43 @ivar alltext: should the value be included in a default field search
44 @type alltext: Boolean
45
46 @ivar store: how to store the document. One of C{yes, no, compress}
47 @type store: string, bool or Lucene constant
48
49 @ivar index: how to index the document.
50 One of C{no, tokenized, untokenized, no_norms}
51 @type index: string, bool or Lucene constant
52
53 @ivar termvector: what sort of term vectors to generate for the document.
54 One of C{yes, no, with_offsets, with_positions, with_positions_and_offsets}
55 @type termvector: string, bool or Lucene constant
56 """
57
58 __slots__=['name', 'value', 'store', 'index', 'termvector', 'alltext']
59
60 ## default storage instructions. These may not be used, depending on value
61 ## type
62 _default_store=Constants.STORE_NO
63 _default_termvector=Constants.TERMVECTOR_NO
64 _default_alltext=True
65
68
69 if isinstance(value, str):
70 value = unicode(value)
71
72 self.value=value
73
74 ## convert to Lucene constants for store, index & termvector. Use
75 ## reasonable defaults based on value type if None specified.
76
77 if store is None:
78 store=self._default_store
79 store=Constants.toConstant(store, "store")
80 self.store=store
81
82 if termvector is None:
83 termvector=self._default_termvector
84 termvector=Constants.toConstant(termvector, "termvector")
85 self.termvector=termvector
86
87 if index is None:
88 if isinstance(value, unicode):
89 # tokenize string like things by default
90 index=Constants.INDEX_TOKENIZED
91 else:
92 index=Constants.INDEX_UNTOKENIZED
93 index=Constants.toConstant(index, "index")
94 self.index=index
95
96 if alltext is None:
97 alltext=self._default_alltext
98 assert isinstance(alltext, bool)
99
100 if isinstance(value, basestring):
101 self.alltext=alltext
102 else:
103 ## it doesn't make sense to try to throw non-text fields into
104 ## alltext
105 self.alltext=False
106
108 """
109 @returns: a Lucene internal representation of this SmartField
110 @rtype: L{Field}
111 """
112 return Field(self.name, conversion.toLucene(self.value),
113 self.store, self.index, self.termvector)
114
116 """a document built from L{SmartField}s
117
118 @ivar fields: fields in this document
119 @type fields: dict of L{SmartField}
120 """
121
122 __slots__=['fields']
123
124 ## how should we store termvectors for the __alltext__ field
125 _alltext_termvector=Constants.TERMVECTOR_NO
126
128 assert isinstance(fields, dict)
129 # populate field.name
130 for name, field in fields.iteritems():
131 assert isinstance(field, SmartField)
132 field.name=name
133 self.fields=fields
134
136 """
137 @returns: a Lucene internal representation of this SmartDoc.
138 @rtype: L{Document}
139 """
140 doc=Document()
141
142 ## list of values to include in default __alltext__ field
143 alltexts=[]
144
145 ## map of field.name => string of python type for non-unicode fields
146 typemap={}
147
148 ## for each field, add to alltext and save typemap info if necessary
149 for f in self.fields.itervalues():
150 assert isinstance(f, SmartField)
151 doc.add(f.toField())
152 if f.alltext:
153 alltexts.append(f.value)
154 if not isinstance(f.value, basestring):
155 typemap[f.name]=type2str[type(f.value)]
156
157 if alltexts:
158 ## build __alltext__ field
159 field=SmartField(u"\n".join(alltexts),
160 store=Constants.STORE_NO,
161 index=Constants.INDEX_TOKENIZED,
162 termvector=self._alltext_termvector)
163 field.name="__alltext__" # should match SmartStorage.query_parser
164 doc.add(field.toField(),)
165
166 if typemap:
167 ## serialize the typemap. We use repr, since it's purely a dict of
168 ## strings at this point
169 field=SmartField(unicode(repr(typemap)),
170 store=Constants.STORE_YES,
171 index=Constants.INDEX_NO)
172 field.name="__typemap__"
173 doc.add(field.toField())
174
175 return doc
176
178 """a search result
179
180 Attributes are expensive to calculate and often unneeded if skipping hits,
181 so we do it lazily.
182 """
183
184 __slots__=['_hit', '_fields', '_score']
185
188
189 @property
191 """relevance score
192
193 @type: float
194 """
195 try:
196 return self._score
197 except AttributeError:
198 self._score=self._hit.getScore()
199 return self._score
200
202 """
203 map of field name=>value
204
205 @type: dict
206 """
207 try:
208 return self._fields
209 except AttributeError:
210 ## typemap will be field.name() => python type
211 raw_typemap=self._hit.get('__typemap__')
212 if raw_typemap is not None:
213 ## raw_typemap is a repr()'d dict of strings. eval() and
214 ## convert back to the type()
215 typemap=dict((n, str2type[t]) for n, t in eval(raw_typemap).iteritems())
216 else:
217 typemap={}
218
219 ## need to retrieve the doc to get a list of fields
220 doc=self._hit.getDocument()
221
222 ## f.name() is expensive. do it once
223 ## XXX this should filter the desired fields, see LuceneCollection.__resultFields
224 if fields is None:
225 fieldobjs = imap(Field.cast_, doc.getFields())
226 else:
227 fieldobjs = (doc.getField(name) for name in fields)
228 raw_fields=((f.name(), f.stringValue()) for f in fieldobjs)
229
230 if not typemap:
231 convert_fields=((name, value) for name, value in raw_fields
232 if name != '__typemap__')
233 else:
234 convert_fields=((name,
235 conversion.fromLucene(value, typemap.get(name, unicode)))
236 for name, value in raw_fields if name != '__typemap__')
237
238 self._fields=dict(convert_fields)
239 return self._fields
240
242 """A wrapper around a lucene index with a smarter API
243
244 @ivar analyzer: a Lucene analyzer
245 @type analyzer: L{lucene.Analyzer}
246
247 @ivar directory: the lucene Directory
248 @type directory: L{FSDirectory}
249
250 @ivar enable_bulk_writes: if true, configure writer optimized for bulk
251 loading. If false, configure for updates. Defaults to False.
252 @type enable_bulk_writes: bool
253 """
254
255 logger=logging.getLogger("SmartStorage")
256
258 """
259 @arg create: create the index if it doesn't exist
260 @type create: bool
261 """
262
263 if index_dir is not None:
264 segments_file_exists = os.path.exists(os.path.join(index_dir, 'segments.gen'))
265 if create:
266 assert not segments_file_exists, "lucene index exists: %s"%index_dir
267 os.makedirs(index_dir)
268 self.logger.warn("Created %s", index_dir)
269 else:
270 assert segments_file_exists
271 self.logger.info("Opened %s", index_dir)
272
273 # XXX should provide a way to pass these in to constructor
274 self.analyzer=StandardAnalyzer()
275
276 ## FSDirectory is faster than MMapDirectory, per some random email
277 ## from google:
278 ## http://mail-archives.apache.org/mod_mbox/lucene-java-user/200510.mbox/<200510110916.03843.paul.elschot%40xs4all.nl>
279 ## ah, science
280 ## XXX it's got concurrency issues tho
281 if index_dir is None:
282 self.directory = RAMDirectory()
283 create = True
284 else:
285 self.directory = FSDirectory.getDirectory(index_dir, create)
286
287 self.writer = IndexWriter(self.directory, self.analyzer, create)
288 self._searcher = IndexSearcher(self.directory)
289 self.enable_bulk_writes = False
290 self.configureWriter(self.writer)
291
292 @property
294 """@ivar index_dir: where the lucene index lives"""
295 if FSDirectory.instance_(self.directory):
296 return FSDirectory.cast_(self.directory).getFile().getCanonicalPath()
297
299 """close all resources"""
300 self._searcher.close()
301 self.writer.close()
302 self.directory.close()
303 self.logger.info("Closed %s", self.index_dir)
304
305 @property
307 """return fresh L{IndexSearcher}"""
308 # XXX this is not thread safe; it would be if objects called close in their destructor
309 # this check is faster than always opening a new Searcher due to caching
310 self.writer.flush()
311 if not self._searcher.getIndexReader().isCurrent():
312 self._searcher.close()
313 self._searcher = IndexSearcher(self.directory)
314 return self._searcher
315
316 @property
320
324
326 """configure an L{IndexWriter}.
327
328 @arg writer: the new IndexWriter
329 @type writer: L{IndexWriter}
330 """
331 ## optimize for reading, not writing: more frequent merges than
332 ## default of 10. 2 comes from Doug Cutting:
333 ## http://www.opensubscriber.com/message/lucene-user%40jakarta.apache.org/803308.html
334 writer.setMergeFactor(2)
335
336 ## Multifile, not compound file, is the default, but we
337 ## set it here for explicitness. Lucene in Action B.3.2
338 ## claims that compound files are 5-10% slower for writing,
339 ## and <PyLucene>/samples/LuceneInAction/lia/indexing/CompoundVersusMultiFileIndexTest.py,
340 ## run as part of the test suite, actually asserts that
341 ## compound time > multifile time (self.assert_(cTiming > mTiming))
342 ## Note that Lucene will convert existing files from compound
343 ## to multifile, or vice versa, at open time.
344 ##
345 ## Use compound files to avoid opening zillions of files hitting a bug
346 ## in Python (and select limits, if we were using select).
347 writer.setUseCompoundFile(True)
348
349 if self.enable_bulk_writes:
350 self.configureBulkWriter(writer)
351 else:
352 self.configureUpdatingWriter(writer)
353
354 @staticmethod
356 """configure an L{IndexWriter} for updates.
357
358 @arg writer: the new IndexWriter
359 @type writer: L{IndexWriter}
360 """
361
362 ## explicitly set merge factor to 2. 2 comes from Doug Cutting:
363 ## http://www.opensubscriber.com/message/lucene-user%40jakarta.apache.org/803308.html
364 writer.setMergeFactor(2)
365
366 @staticmethod
368 """configure an L{IndexWriter} for bulk loading.
369
370 @arg writer: the new IndexWriter
371 @type writer: L{IndexWriter}
372 """
373 ## set merge factor up from default
374 writer.setMergeFactor(20)
375
376 ## set "the minimal number of documents required before the buffered
377 ## in-memory documents are merg[ed] and a new Segment is created."
378
379 ## Make sure this doesn't use too much memory: if a doc is ~1K,
380 ## then buffering should be ~10M, which sounds very reasonable, but
381 ## I could be wrong. This means we'll have fairly big segments,
382 ## and possibly fairly long pauses when merging.
383 ##
384 ## 10000 seemed to chew through RAM, so dialed it back to 200, which
385 ## results in about 700M usage.
386 writer.setMaxBufferedDocs(200)
387
389 """search the index
390
391 @arg query: a Lucene U{query<http://lucene.apache.org/java/docs/queryparsersyntax.html>}
392 @type query: String or L{lucene.Query}
393
394 @rtype: L{HitsWrapper} of L{SmartHit}s
395 """
396 if isinstance(query, basestring):
397 # default field should match L{Smarts.SmartDoc}
398 query = QueryParser("__alltext__", self.analyzer).parse(query)
399
400 assert Query.instance_(query)
401 hits=self.searcher.search(query)
402 return HitsWrapper(hits, SmartHit)
403
405 """insert a document
406
407 @arg doc: the document to insert
408 @type doc: L{SmartDoc}
409 """
410 assert isinstance(doc, SmartDoc)
411 self.writer.addDocument(doc.toDocument())
412
414 """delete documents from lucene index matching Term
415
416 @arg name_or_term: field name or Term to delete
417 @type name_or_term: string or L{Term}
418
419 @arg value: field value, or None if passing a L{Term}
420 @type value: unicode or None
421 """
422 if Term.instance_(name_or_term):
423 assert value is None
424 term=name_or_term
425 else:
426 assert isinstance(name_or_term, basestring)
427 assert isinstance(value, basestring)
428 term=Term(name_or_term, value)
429 ## XXX this can/should be writer, as of Pylucene r331.
430 ## see http://lists.osafoundation.org/pipermail/pylucene-dev/2007-May/001797.html
431 return self.writer.deleteDocuments(term)
432
434 """yield all terms for a given field
435
436 @arg field: name of the field
437 @type field: String
438
439 @arg include_counts: should value frequencies be included
440 @type include_counts: Boolean
441
442 @returns: if include_counts is True, tuples of (term, count). If
443 false, just list of terms
444 """
445 if include_counts:
446 return TermIterator.termIteratorCount(self.reader, field, count_needed=True)
447 else:
448 return TermIterator.termIterator(self.reader, field)
449
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Mon Mar 10 05:37:19 2008 | http://epydoc.sourceforge.net |