Package grassyknoll :: Package backend :: Package lucene :: Module LuceneCollection
[hide private]

Source Code for Module grassyknoll.backend.lucene.LuceneCollection

  1  """contains L{LuceneCollection}, a L{Collection.Collection} based on L{lucene}""" 
  2  import shutil 
  3  from lucene import Term, TermQuery 
  4  from OneOfQuery import OneOfQuery 
  5   
  6  import Smarts 
  7  from LuceneConstants import Constants 
  8  from grassyknoll.collection import Collection 
  9  from grassyknoll.lib import Norman 
 10   
 11  # XXX see http://wiki.apache.org/solr/StandardRequestHandler for more things 
 12  # to stuff in metadata 
 13   
class LuceneCollection(Collection.Collection):
    """a L{Collection.Collection} with L{Smarts}

    This class implements all of the L{Collection.Collection} methods.

    L{Collection.CollectionResult}s returned by L{searchQuery} include a
    C{__score__} field, a measure of relevance.

    L{Collection.CollectionResultSet}s returned by L{searchQuery} include as
    metadata:
      - C{count}: a count of the total number of matches

    @ivar storage: the underlying index storage; the location of the index
    is read from C{storage.index_dir}
    @type storage: L{Smarts.SmartStorage}

    @ivar storage_normans: norman name => a Norman consisting of
    L{SmartFieldNorman}s describing how fields should be saved into the index
    @type storage_normans: dict

    @ivar default_storage_norman: a default L{storage_normans} to be used if
    no norman is specified for a L{Collection.CollectionDocument}
    @type default_storage_norman: L{Norman.ObjectNorman}
    """

    def __init__(self, index_dir, default_storage_norman, create=False, **storage_normans):
        """
        @arg index_dir: the location of the index, handed to
        L{Smarts.SmartStorage}
        @type index_dir: string

        @arg default_storage_norman: norman applied to documents that do not
        name one themselves; must be callable

        @arg create: create the index if it doesn't exist
        @type create: bool

        @arg storage_normans: additional normans, keyed by norman name
        """
        assert callable(default_storage_norman)
        self.default_storage_norman = default_storage_norman
        self.storage_normans = storage_normans
        self.storage = Smarts.SmartStorage(index_dir, create)

    def __len__(self):
        """@returns: the document count reported by the storage's reader"""
        return self.storage.reader.numDocs()

    @staticmethod
    def __oneIdQuery(id):
        """@returns: a L{lucene.Query} for a single id"""
        return TermQuery(Term('__id__', id))

    @staticmethod
    def __manyIdQuery(ids):
        """@returns: a L{lucene.Query} for many ids"""
        return OneOfQuery('__id__', ids)

    @Collection.addMetaData
    def delete(self, ids):
        """delete the documents with the given ids

        @arg ids: ids of the documents to delete; any iterable is accepted

        @returns: the ids that were passed in
        @rtype: L{Collection.CollectionIds}
        """
        # Materialize once: ids is walked here and then handed on below, so a
        # one-shot iterable would otherwise reach CollectionIds exhausted.
        ids = list(ids)

        ## XXX we can pass a list of terms here, if SmartStorage supported it,
        ## moving the loop into Java. see:
        ## http://lucene.apache.org/java/2_1_0/api/org/apache/lucene/index/IndexWriter.html#deleteDocuments(org.apache.lucene.index.Term[])
        for uid in ids:
            self.storage.delete('__id__', uid)
        return Collection.CollectionIds(ids)

    @Collection.addMetaData
    def retrieve(self, ids, fields=None):
        """fetch documents by id

        @arg ids: a sequence of ids (must support C{len} and indexing)

        @arg fields: the fields that should be returned; C{__id__} is always
        added. Defaults to None, which is passed on unchanged.

        @rtype: L{Collection.CollectionResultSet}
        """
        if fields is not None and '__id__' not in fields:
            # Build a fresh tuple instead of using ``+=``: augmented
            # assignment would extend a caller-supplied list in place and
            # raises TypeError for a set.
            fields = tuple(fields) + ('__id__',)

        if len(ids) == 1:
            hits = self.storage.search(self.__oneIdQuery(ids[0]))
            assert len(hits) in (0, 1)
        else:
            hits = self.storage.search(self.__manyIdQuery(ids))
            assert len(hits) <= len(ids)

        # XXX this should use __resultField?
        results = [Collection.CollectionResult(hit.fields(fields)) for hit in hits]
        return Collection.CollectionResultSet(results)

    @Collection.addMetaData
    def create(self, docs):
        """insert documents, replacing any stored under the same ids

        @arg docs: the L{Collection.CollectionDocument}s to store; any
        iterable is accepted

        @returns: the ids of the given documents
        @rtype: L{Collection.CollectionIds}
        """
        # docs is walked three times below; materialize it so that a
        # generator argument is not silently exhausted after the first pass.
        docs = list(docs)

        ## delete docs:
        self.delete([doc.id for doc in docs])

        ## insert docs
        for doc in docs:
            self.storage.insert(self.buildSmartDoc(doc))

        return Collection.CollectionIds([doc.id for doc in docs])

    ## XXX this can get pushed down into Smart*, see
    ## http://lucene.apache.org/java/2_1_0/api/org/apache/lucene/index/IndexReader.html#document(int, org.apache.lucene.document.FieldSelector)
    @staticmethod
    def __resultFields(hit, fields=None):
        """builds fields for a L{Collection.CollectionResult} from a hit

        @rtype: dict
        """
        # just modify fields in place since we don't do anything else with the hit
        d = hit.fields(fields)
        # XXX can calculating score be suppressed entirely? Solr does this
        d['__score__'] = hit.score
        return d

    @Collection.addMetaData
    def searchQuery(self, q, start=0, stop=None, fields=None):
        """search the collection.

        start and stop are interpreted in Python L{slice} sense.

        @arg q: a Lucene query string
        @type q: unicode

        @arg start: index of first result to return.
        @type start: int

        @arg stop: index of last+1 result to return.
        @type stop: int

        @arg fields: a sub-L{set} of fields that should be returned. Defaults
        to None, meaning all available fields.
        @type fields: set

        @returns: the results in the requested slice (possibly none); the
        C{count} metadata is the number of hits before slicing
        @rtype: L{CollectionResultSet}
        """
        hits = self.storage.search(q)

        results = [Collection.CollectionResult(self.__resultFields(h, fields))
                   for h in hits[start:stop]]

        ## XXX it'd be nice to return addt'l metadata - maxScore comes to
        ## mind. That requires using a different Searcher.search method all
        ## the way down in SmartStorage.search. Whatever returns a TopDocs()
        ## instead of a Hits().
        return Collection.CollectionResultSet(results, {'count': len(hits)})

    @Collection.addMetaData
    def list(self):
        """list the C{__id__} terms held by the storage

        @rtype: L{Collection.CollectionIds}
        """
        raw_ids = list(self.storage.getAllTerms('__id__', include_counts=False))
        return Collection.CollectionIds(raw_ids)

    def close(self):
        """close the underlying storage"""
        self.storage.close()

    def cleanUp(self):
        "Remove storage directory."
        # XXX push down implementation to SmartStorage
        if self.storage.index_dir is not None:
            shutil.rmtree(self.storage.index_dir)

    def buildSmartDoc(self, doc):
        """convert a document using its own norman, or the default one

        A document names its norman through an optional C{norman} attribute;
        a name missing from L{storage_normans} raises C{KeyError}.

        @type doc: L{Collection.CollectionDocument}

        @rtype: L{Smarts.SmartDoc}
        """
        assert isinstance(doc, Collection.CollectionDocument)
        norman = getattr(doc, 'norman', None)
        norman = self.default_storage_norman if norman is None else self.storage_normans[norman]
        return Smarts.SmartDoc(norman(doc))
165
def SmartFieldNorman(store=None, index=None, termvector=None, alltext=None,
                     optional=False, filled=False, prohibited=False):
    """build a L{Norman.FunctionNorman} that produces L{Smarts.SmartField}s

    C{store}, C{index} and C{termvector}, when given, are run through
    C{Constants.toConstant} before being handed on; C{None} is forwarded
    untouched. C{alltext} must be C{None} or a C{bool}.

    See L{Smarts.SmartField} and L{Norman.Norman} for parameters.
    """
    converted = {}
    # Convert in signature order (store, index, termvector) so that a failing
    # conversion surfaces for the same argument as before.
    for kind, raw in (('store', store), ('index', index), ('termvector', termvector)):
        converted[kind] = raw if raw is None else Constants.toConstant(raw, kind)

    assert alltext is None or isinstance(alltext, bool)

    return Norman.FunctionNorman(Smarts.SmartField, alltext=alltext,
                                 optional=optional, filled=filled,
                                 prohibited=prohibited, **converted)
180
def InternalFieldNorman(optional=False, filled=True, prohibited=False):
    """a L{SmartFieldNorman} preconfigured for internally-used fields

    The field is stored, indexed untokenized, has no term vector and is kept
    out of the all-text field; the remaining flags are passed through.
    """
    internal_settings = dict(
        store=Constants.STORE_YES,
        index=Constants.INDEX_UNTOKENIZED,
        termvector=Constants.TERMVECTOR_NO,
        alltext=False,
    )
    return SmartFieldNorman(optional=optional, filled=filled,
                            prohibited=prohibited, **internal_settings)
188
def LuceneNorman(unknown='error', optional=False, filled=False, prohibited=False):
    """a reasonable default L{Norman.ObjectNorman} for use with L{LuceneCollection}.

    The returned norman already carries an L{InternalFieldNorman} for the
    C{__id__} field. Users should add additional L{Norman}s as desired.
    """
    options = dict(unknown=unknown, optional=optional,
                   filled=filled, prohibited=prohibited)
    object_norman = Norman.ObjectNorman(**options)
    # attach the norman for the internal __id__ field
    setattr(object_norman, '__id__', InternalFieldNorman())
    return object_norman
198