We're updating the issue view to help you get more done. 

ZoieSegmentTermDocs doesn't reset the _nextDelDoc cache after seek

Description

ZoieSegmentTermDocs doesn't reset the _nextDelDoc cache after a seek. This can cause a bug when a single ZoieSegmentTermDocs instance is reused for multiple terms within one query (for example, a term prefix query): after seeking to the next term, it may treat some deleted docs as non-deleted.

Here is my fix

ZoieSegmentTermDocs.java

package proj.zoie.impl.indexing.internal;

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.FilterIndexReader.FilterTermDocs;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

/**
 * A {@link TermDocs} wrapper that hides the documents contained in a
 * deletion set.
 *
 * <p>The wrapper walks the deletion set in parallel with the wrapped
 * {@code TermDocs}: {@code _nextDelDoc} caches the smallest deleted doc id
 * that has not been passed yet. Because a {@code TermDocs} may be reused for
 * several terms (each {@code seek} restarts at doc 0), the deletion cursor is
 * rewound on every {@code seek}.
 */
public class ZoieSegmentTermDocs extends FilterTermDocs {
  /** Source of deleted doc ids; a fresh iterator is obtained on each rewind. */
  private final DocIdSet delSet;

  /**
   * First deleted doc id of {@link #delSet}, captured at construction.
   * Starts at -1 so that it differs from the initial {@code _nextDelDoc}
   * (0) and the very first {@link #resetDelIter()} call always initializes
   * the cursor.
   */
  private int _firstDelDoc = -1;

  /** Current cursor over the deletion set; may be null if the set yields no iterator. */
  private DocIdSetIterator _delSetIterator;

  /** Doc id the deletion cursor is positioned on, or NO_MORE_DOCS when exhausted. */
  private int _nextDelDoc;

  /**
   * @param in     the term docs to filter
   * @param delSet the set of deleted doc ids to hide
   * @throws IOException if the deletion set cannot be iterated
   */
  public ZoieSegmentTermDocs(TermDocs in, DocIdSet delSet) throws IOException {
    super(in);
    this.delSet = delSet;
    resetDelIter();
    _firstDelDoc = _nextDelDoc;
  }

  /** Rewinds the deletion cursor, then positions the wrapped term docs on {@code term}. */
  @Override
  public void seek(Term term) throws IOException {
    resetDelIter();
    super.seek(term);
  }

  /** Rewinds the deletion cursor, then positions the wrapped term docs on the enum's current term. */
  @Override
  public void seek(TermEnum termEnum) throws IOException {
    resetDelIter();
    super.seek(termEnum);
  }

  /**
   * Repositions the deletion cursor on the first deleted doc.
   *
   * <p>A new iterator is only created when the cursor has moved: the cursor
   * is advanced exclusively with targets greater than {@code _nextDelDoc},
   * so an unchanged {@code _nextDelDoc} means the iterator is still on the
   * first deleted doc and can be reused as is.
   */
  private void resetDelIter() throws IOException {
    if (_firstDelDoc != _nextDelDoc) {
      _delSetIterator = delSet.iterator();
      // DocIdSet.iterator() is allowed to return null for an empty set;
      // treat that as "nothing is deleted".
      _nextDelDoc = (_delSetIterator == null)
          ? DocIdSetIterator.NO_MORE_DOCS
          : _delSetIterator.nextDoc();
    }
  }

  /**
   * Moves to the next non-deleted document.
   *
   * @return {@code true} if such a document exists
   */
  @Override
  public boolean next() throws IOException {
    boolean hasNext = in.next();
    while (hasNext && _nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) {
      int currID = in.doc();
      if (currID < _nextDelDoc) {
        return true;
      }
      if (currID == _nextDelDoc) {
        // Current doc is deleted: skip it and re-check the following one.
        hasNext = in.next();
      } else {
        // Deletion cursor lags behind; target is strictly greater than the
        // cursor position, so advance() is well defined here.
        _nextDelDoc = _delSetIterator.advance(currID);
      }
    }
    return hasNext;
  }

  /**
   * Fills {@code docs}/{@code freqs} with the next non-deleted documents.
   *
   * @return the number of entries written
   */
  @Override
  public int read(final int[] docs, final int[] freqs) throws IOException {
    if (_nextDelDoc == DocIdSetIterator.NO_MORE_DOCS) {
      // No deletions left to filter: use the wrapped bulk read.
      return in.read(docs, freqs);
    }
    int i = 0;
    while (i < docs.length) {
      if (!in.next()) {
        return i;
      }
      int doc = in.doc();
      // Only advance when strictly past the cursor: calling advance() with a
      // target equal to the cursor's current doc may legally step beyond it,
      // which would make the deleted doc look live.
      if (doc > _nextDelDoc) {
        _nextDelDoc = _delSetIterator.advance(doc);
      }
      if (doc == _nextDelDoc) {
        continue; // deleted
      }
      docs[i] = doc;
      freqs[i] = in.freq();
      i++;
    }
    return i;
  }

  /**
   * Skips to the first non-deleted document whose id is at least {@code i}.
   *
   * @return {@code true} if such a document exists
   */
  @Override
  public boolean skipTo(int i) throws IOException {
    if (!in.skipTo(i)) {
      return false;
    }
    if (_nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) {
      int doc = in.doc();
      // See read(): never advance the cursor onto its own position.
      if (doc > _nextDelDoc) {
        _nextDelDoc = _delSetIterator.advance(doc);
      }
      if (doc == _nextDelDoc) {
        return next();
      }
    }
    return true;
  }
}

Here is the code to reproduce the bug

TermDocRangeTest.java

package com.hzb.test;

import java.io.File;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;

import proj.zoie.api.DataConsumer.DataEvent;
import proj.zoie.api.indexing.ZoieIndexable;
import proj.zoie.api.indexing.ZoieIndexableInterpreter;
import proj.zoie.impl.indexing.ZoieConfig;
import proj.zoie.impl.indexing.ZoieSystem;

/**
 * Stand-alone reproduction: indexes three documents whose "num" field starts
 * with "abc", runs the prefix query {@code num:abc*} and prints the hit count,
 * then deletes all three documents and prints the hit count of the same query
 * again.
 */
public class TermDocRangeTest {

  public static void main(String[] args) throws Exception {
    ZoieConfig config = new ZoieConfig();
    ZoieSystem<?, DataDoc> zoie = ZoieSystem.buildDefaultInstance(
        new File("./data"), new DefaultInterpreter(), config);
    zoie.start();

    // Phase 1: add three documents (uids 1..3) as version "1".
    Document first = numDoc("abcdef");
    Document second = numDoc("abcd");
    Document third = numDoc("abcde");
    List<DataEvent<DataDoc>> additions = Arrays.asList(
        new DataEvent<DataDoc>(new DataDoc(1, first), "1"),
        new DataEvent<DataDoc>(new DataDoc(2, second), "1"),
        new DataEvent<DataDoc>(new DataDoc(3, third), "1"));
    zoie.consume(additions);
    zoie.flushEvents(10000);

    List<?> readerList = zoie.getIndexReaders();
    MultiReader reader = combine(readerList);
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser(
        Version.LUCENE_35, "num", new StandardAnalyzer(Version.LUCENE_35));
    Query prefixQuery = parser.parse("num:abc*");
    TopDocs hits = searcher.search(prefixQuery, 100);
    System.out.println(hits.totalHits);

    searcher.close();
    reader.close();
    zoie.returnIndexReaders((List) readerList);

    // Phase 2: delete the same three uids as version "2", flushing only to
    // the in-memory index, and repeat the query.
    List<DataEvent<DataDoc>> deletions = Arrays.asList(
        new DataEvent<DataDoc>(new DataDoc(1), "2"),
        new DataEvent<DataDoc>(new DataDoc(2), "2"),
        new DataEvent<DataDoc>(new DataDoc(3), "2"));
    zoie.consume(deletions);
    zoie.flushEventsToMemoryIndex(10000);

    readerList = zoie.getIndexReaders();
    reader = combine(readerList);
    searcher = new IndexSearcher(reader);
    hits = searcher.search(prefixQuery, 100);
    System.out.println(hits.totalHits);

    System.exit(0);
  }

  /** Builds a document with a single stored, un-analyzed "num" field. */
  private static Document numDoc(String value) {
    Document document = new Document();
    Fieldable field = new Field("num", value, Store.YES, Index.NOT_ANALYZED_NO_NORMS);
    document.add(field);
    return document;
  }

  /** Wraps the given readers in one MultiReader that does not close them. */
  private static MultiReader combine(List<?> readers) {
    IndexReader[] subReaders = readers.toArray(new IndexReader[readers.size()]);
    return new MultiReader(subReaders, false);
  }
}

/** Interpreter that passes {@link DataDoc} through unchanged, as it already is a ZoieIndexable. */
class DefaultInterpreter implements ZoieIndexableInterpreter<DataDoc> {
  @Override
  public ZoieIndexable convertAndInterpret(DataDoc src) {
    return src;
  }
}

/**
 * Minimal indexable: either a live document (uid plus Lucene document) or a
 * delete marker (uid only).
 */
class DataDoc implements ZoieIndexable {
  private final long uid;
  private final Document doc;
  private final boolean valid;

  /** Creates a live entry to be indexed. */
  public DataDoc(long uid, Document doc) {
    this(uid, doc, true);
  }

  /** Creates a delete marker for {@code uid}. */
  public DataDoc(long uid) {
    this(uid, null, false);
  }

  private DataDoc(long uid, Document doc, boolean valid) {
    this.uid = uid;
    this.doc = doc;
    this.valid = valid;
  }

  @Override
  public long getUID() {
    return uid;
  }

  /** A delete marker is any instance built without a document. */
  @Override
  public boolean isDeleted() {
    return !valid;
  }

  @Override
  public boolean isSkip() {
    return false;
  }

  @Override
  public IndexingReq[] buildIndexingReqs() {
    IndexingReq request = new IndexingReq(doc);
    return new IndexingReq[] {request};
  }

  @Override
  public boolean isStorable() {
    return false;
  }

  @Override
  public byte[] getStoreValue() {
    return null;
  }
}

Environment

None

Status

Assignee

John Wang

Reporter

Zhuobin He

Labels

None

Components

Fix versions

Affects versions

3.1.0

Priority

Major