Uploaded image for project: 'Zoie'
  1. Zoie
  2. ZOIE-106

ZoieSegmentTermDocs doesn't reset the _nextDelDoc cache after seek

    Details

    • Type: Bug
    • Status: Closed
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: 3.1.0
    • Fix Version/s: 3.2.0
    • Component/s: core
    • Labels:
      None

      Description

      ZoieSegmentTermDocs doesn't reset the _nextDelDoc cache after a seek. This can cause a bug when a single ZoieSegmentTermDocs instance is reused for multiple terms within one query (for example, a term prefix query): after seeking to the next term, the stale cache may cause some deleted docs to be treated as non-deleted.

      Here is my fix

      ZoieSegmentTermDocs.java
      package proj.zoie.impl.indexing.internal;
      
      import java.io.IOException;
      
      import org.apache.lucene.index.Term;
      import org.apache.lucene.index.TermDocs;
      import org.apache.lucene.index.TermEnum;
      import org.apache.lucene.index.FilterIndexReader.FilterTermDocs;
      import org.apache.lucene.search.DocIdSet;
      import org.apache.lucene.search.DocIdSetIterator;
      
      /**
       * A {@link TermDocs} filter that hides every document contained in a deletion set.
       *
       * <p>The wrapped postings and the deletion set are walked in lock-step: {@code _nextDelDoc}
       * caches the deletion iterator's current doc id, and each posting is compared against it.
       * Because the deletion iterator only moves forward, it has to be restarted whenever the
       * wrapped {@code TermDocs} is re-positioned on a new term via {@code seek}; otherwise
       * deletions that lie before the cached position would be missed for the new term.
       *
       * <p>This code assumes both the wrapped postings and the deletion iterator return doc ids
       * in increasing order (the usual Lucene contract; not verified here).
       */
      public class ZoieSegmentTermDocs extends FilterTermDocs {
          private final DocIdSet delSet;
          // First doc id of the deletion set. Starts at -1 so that it differs from the initial
          // _nextDelDoc (0) and the constructor's resetDelIter() call really creates the iterator.
          private int _firstDelDoc = -1;
          // Null when the deletion set reports no iterator; _nextDelDoc is NO_MORE_DOCS in that case.
          private DocIdSetIterator _delSetIterator;
          // Doc id the deletion iterator is currently positioned on, or NO_MORE_DOCS when exhausted.
          private int _nextDelDoc;

          /**
           * @param in     the postings to filter
           * @param delSet the set of deleted doc ids to hide
           * @throws IOException if the deletion iterator cannot be created or positioned
           */
          public ZoieSegmentTermDocs(TermDocs in, DocIdSet delSet) throws IOException {
              super(in);
              this.delSet = delSet;
              resetDelIter();
              _firstDelDoc = _nextDelDoc;
          }

          /** Restarts the deletion iterator, then positions the wrapped postings on {@code term}. */
          @Override
          public void seek(Term term) throws IOException {
              resetDelIter();
              super.seek(term);
          }

          /** Restarts the deletion iterator, then positions the wrapped postings on {@code termEnum}. */
          @Override
          public void seek(TermEnum termEnum) throws IOException {
              resetDelIter();
              super.seek(termEnum);
          }

          /**
           * Re-creates the deletion iterator and positions it on its first doc.
           *
           * <p>Skipped when the cached position still equals the first deleted doc: the iterator is
           * only ever advanced to a strictly greater target (see {@link #isDeleted(int)}), so an
           * unchanged value means the iterator has not moved since it was created.
           */
          private void resetDelIter() throws IOException {
              if (_firstDelDoc != _nextDelDoc) {
                  _delSetIterator = delSet.iterator();
                  // DocIdSet.iterator() is documented as allowed to return null for an empty set;
                  // treat that as "no deletions" instead of failing with a NullPointerException.
                  _nextDelDoc = (_delSetIterator == null)
                          ? DocIdSetIterator.NO_MORE_DOCS
                          : _delSetIterator.nextDoc();
              }
          }

          /**
           * Tells whether {@code doc} is in the deletion set, moving the deletion iterator forward
           * as needed.
           *
           * <p>The iterator is advanced only when {@code doc} lies strictly beyond its current
           * position. Calling {@code advance(target)} with a target equal to the current doc is
           * implementation-dependent in Lucene (the iterator may stay put or move past it), and an
           * iterator that moves past it would make a deleted doc look live.
           *
           * <p>When the iterator is null or exhausted, {@code _nextDelDoc} is NO_MORE_DOCS, which is
           * expected to be larger than any real doc id, so the first check returns before the
           * iterator is touched.
           */
          private boolean isDeleted(int doc) throws IOException {
              if (doc < _nextDelDoc) {
                  return false;
              }
              if (doc > _nextDelDoc) {
                  _nextDelDoc = _delSetIterator.advance(doc);
              }
              return doc == _nextDelDoc;
          }

          /**
           * Moves to the next non-deleted posting.
           *
           * @return {@code true} if positioned on a live doc, {@code false} if the postings are exhausted
           */
          @Override
          public boolean next() throws IOException {
              while (in.next()) {
                  if (!isDeleted(in.doc())) {
                      return true;
                  }
              }
              return false;
          }

          /**
           * Fills {@code docs}/{@code freqs} with up to {@code docs.length} non-deleted postings.
           *
           * @return the number of entries written
           */
          @Override
          public int read(final int[] docs, final int[] freqs) throws IOException {
              if (_nextDelDoc == DocIdSetIterator.NO_MORE_DOCS) {
                  // No deletions left to filter: hand off to the wrapped bulk read.
                  return in.read(docs, freqs);
              }
              int count = 0;
              while (count < docs.length && in.next()) {
                  int doc = in.doc();
                  if (!isDeleted(doc)) {
                      docs[count] = doc;
                      freqs[count] = in.freq();
                      count++;
                  }
              }
              return count;
          }

          /**
           * Skips to the first non-deleted posting whose doc id is {@code >= i}.
           *
           * @return {@code true} if such a posting exists
           */
          @Override
          public boolean skipTo(int i) throws IOException {
              if (!in.skipTo(i)) {
                  return false;
              }
              // If the landing doc is deleted, fall through to the next live one.
              return !isDeleted(in.doc()) || next();
          }
      }
      

      Here is the code to reproduce the bug

      TermDocRangeTest.java
      package com.hzb.test;
      
      import java.io.File;
      import java.util.Arrays;
      import java.util.List;
      
      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.document.Document;
      import org.apache.lucene.document.Field;
      import org.apache.lucene.document.Field.Index;
      import org.apache.lucene.document.Field.Store;
      import org.apache.lucene.document.Fieldable;
      import org.apache.lucene.index.IndexReader;
      import org.apache.lucene.index.MultiReader;
      import org.apache.lucene.queryParser.QueryParser;
      import org.apache.lucene.search.IndexSearcher;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.search.TopDocs;
      import org.apache.lucene.util.Version;
      
      import proj.zoie.api.DataConsumer.DataEvent;
      import proj.zoie.api.indexing.ZoieIndexable;
      import proj.zoie.api.indexing.ZoieIndexableInterpreter;
      import proj.zoie.impl.indexing.ZoieConfig;
      import proj.zoie.impl.indexing.ZoieSystem;
      
      /**
       * Stand-alone reproduction driver for the issue.
       *
       * <p>Indexes three documents whose "num" field starts with "abc", runs the prefix query
       * {@code num:abc*} and prints the hit count; then sends delete events for the same three
       * UIDs, flushes them to the memory index, runs the same query again and prints the hit
       * count a second time. (Presumably the second count should be 0 when deletes are honored.)
       */
      public class TermDocRangeTest {
          public static void main(String[] args) throws Exception {
              ZoieConfig config = new ZoieConfig();

              ZoieSystem<?, DataDoc> zoie = ZoieSystem.buildDefaultInstance(
                      new File("./data"),
                      new DefaultInterpreter(),
                      config);
              zoie.start();

              // Phase 1: add three documents under version "1" and flush them.
              DataEvent<DataDoc> add1 = addEvent(1, "abcdef");
              DataEvent<DataDoc> add2 = addEvent(2, "abcd");
              DataEvent<DataDoc> add3 = addEvent(3, "abcde");
              zoie.consume(Arrays.asList(add1, add2, add3));
              zoie.flushEvents(10000);

              // Search across all of Zoie's readers combined into a single MultiReader.
              List<?> firstReaders = zoie.getIndexReaders();
              IndexReader[] firstArray = firstReaders.toArray(new IndexReader[firstReaders.size()]);
              MultiReader combined = new MultiReader(firstArray, false);
              IndexSearcher searcher = new IndexSearcher(combined);
              QueryParser parser =
                      new QueryParser(Version.LUCENE_35, "num", new StandardAnalyzer(Version.LUCENE_35));
              Query prefixQuery = parser.parse("num:abc*");
              TopDocs hits = searcher.search(prefixQuery, 100);
              System.out.println(hits.totalHits);
              searcher.close();
              combined.close();
              zoie.returnIndexReaders((List) firstReaders);

              // Phase 2: delete the same three UIDs under version "2"; flush only to the memory index.
              DataEvent<DataDoc> del1 = deleteEvent(1);
              DataEvent<DataDoc> del2 = deleteEvent(2);
              DataEvent<DataDoc> del3 = deleteEvent(3);
              zoie.consume(Arrays.asList(del1, del2, del3));

              zoie.flushEventsToMemoryIndex(10000);

              // Re-run the same prefix query against freshly obtained readers.
              List<?> secondReaders = zoie.getIndexReaders();
              IndexReader[] secondArray = secondReaders.toArray(new IndexReader[secondReaders.size()]);
              combined = new MultiReader(secondArray, false);
              searcher = new IndexSearcher(combined);
              hits = searcher.search(prefixQuery, 100);
              System.out.println(hits.totalHits);

              System.exit(0);
          }

          /** Builds a version-"1" event that adds a document whose "num" field holds {@code value}. */
          private static DataEvent<DataDoc> addEvent(long uid, String value) {
              Document document = new Document();
              Fieldable numField = new Field("num", value, Store.YES, Index.NOT_ANALYZED_NO_NORMS);
              document.add(numField);
              return new DataEvent<DataDoc>(new DataDoc(uid, document), "1");
          }

          /** Builds a version-"2" event that marks the document with the given UID as deleted. */
          private static DataEvent<DataDoc> deleteEvent(long uid) {
              return new DataEvent<DataDoc>(new DataDoc(uid), "2");
          }
      }
      
      /**
       * Pass-through interpreter: {@link DataDoc} already implements {@code ZoieIndexable},
       * so no conversion is performed.
       */
      class DefaultInterpreter implements ZoieIndexableInterpreter<DataDoc> {
      
      	/**
      	 * Returns the given source object unchanged.
      	 *
      	 * @param src the data object to interpret
      	 * @return {@code src} itself
      	 */
      	@Override
      	public ZoieIndexable convertAndInterpret(DataDoc src) {
      		return src;
      	}
      	
      }
      /**
       * Minimal {@code ZoieIndexable} used by the reproduction driver: either a live document
       * (UID plus Lucene {@code Document}) or a delete marker (UID only).
       */
      class DataDoc implements ZoieIndexable {
          private final long id;
          private final Document document;
          private final boolean live;

          /** Creates a live entry carrying {@code doc} under the given UID. */
          public DataDoc(long uid, Document doc) {
              this(uid, doc, true);
          }

          /** Creates a delete marker for the given UID; no document is attached. */
          public DataDoc(long uid) {
              this(uid, null, false);
          }

          private DataDoc(long uid, Document doc, boolean live) {
              this.id = uid;
              this.document = doc;
              this.live = live;
          }

          @Override
          public long getUID() {
              return id;
          }

          /** A delete marker reports {@code true}; a live entry reports {@code false}. */
          @Override
          public boolean isDeleted() {
              return !live;
          }

          @Override
          public boolean isSkip() {
              return false;
          }

          /** Wraps the attached document (null for a delete marker) in a single indexing request. */
          @Override
          public IndexingReq[] buildIndexingReqs() {
              IndexingReq request = new IndexingReq(document);
              return new IndexingReq[]{request};
          }

          @Override
          public boolean isStorable() {
              return false;
          }

          @Override
          public byte[] getStoreValue() {
              return null;
          }
      }
      

        Attachments

          Activity

            People

            • Assignee:
              jwang John Wang
              Reporter:
              zhuobin.he Zhuobin He
            • Votes:
              0 Vote for this issue
              Watchers:
              1 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved: