Sort a Lucene index by a StoredField or a function of docID

2 weeks ago 17
ARTICLE AD BOX

I have a lot of static (unchanging) Lucene indexes where each document has been given an integer StoredField called "fileId" (and each index only has 1 segment). I now want to modify these indexes so that the documents are sorted by fileId.

I found two unit tests for index sorting (in Lucene's TestSortingCodecReader), but if I use new Sort(new SortField("fileId", INT)) as in those tests, then SortingCodecReader.wrap() throws "IllegalStateException: unexpected docvalues type NONE for field 'fileId' (expected=NUMERIC). Re-index with correct docvalues type."

I think that's because "fileId" was only added as a StoredField and was never indexed as a doc-values field. Fortunately I can easily get the fileId from the docID, but unfortunately SortField doesn't provide a constructor to sort by something based on docID, so I made DocSortField, which extends SortField. It seems like it should work, but for some reason the index hasn't been sorted correctly after the code runs (in fact, the order doesn't appear to have changed at all). There's no error message, so I have no idea where it's going wrong. I've tried adding a call to forceMerge(1) to enforce the sorting, but it doesn't help.

My version of Lucene is 9.11.1.

import org.apache.lucene.index.*; import org.apache.lucene.search.Sort; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import static org.apache.lucene.index.PostingsEnum.POSITIONS; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; public class LuceneIndexSorter { public static void main(String[] args) throws IOException { File oldDir = new File("path/to/index"); if (areDocumentsInOrder(oldDir)) { throw new RuntimeException("Documents already in order"); } File newDir = new File(oldDir.getParent(), oldDir.getName() + " 2"); Sort indexSort = new Sort(new DocSortField("fileId", LuceneIndexSorter::getFileId)); // The rest comes from https://github.com/apache/lucene-solr/blob/a7bdc6893e21954ed9f6d8bce256a4a9c917310b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java IndexWriterConfig indexWriterConfig = new IndexWriterConfig().setIndexSort(indexSort); Directory newDirectory = FSDirectory.open(newDir.toPath()); Directory oldDirectory = FSDirectory.open(oldDir.toPath()); IndexWriter indexWriter = new IndexWriter(newDirectory, indexWriterConfig); try (DirectoryReader indexReader = DirectoryReader.open(oldDirectory)) { List<CodecReader> wrappedCodecReaders = new ArrayList<>(); for (LeafReaderContext ctx : indexReader.leaves()) { CodecReader wrap = SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort); assert wrap.toString().startsWith("SortingCodecReader("); wrappedCodecReaders.add(wrap); } indexWriter.addIndexes(wrappedCodecReaders.toArray(new CodecReader[0])); indexWriter.commit(); // This line is needed to prevent "IndexNotFoundException: no segments* file found in MMapDirectory@...", although it's not used in the unit tests } // Check if it worked if (!areDocumentsInOrder(newDir)) { throw new RuntimeException("Documents weren't correctly sorted"); // Program ends up 
here with no other output } } private static boolean areDocumentsInOrder(File dir) throws IOException { FSDirectory directory = FSDirectory.open(dir.toPath()); DirectoryReader indexReader = DirectoryReader.open(directory); try (LeafReader leafReader = indexReader.leaves().get(0).reader()) { PostingsEnum postings = leafReader.postings(new Term("tokens", "a"), POSITIONS); int docId = postings.nextDoc(); List<Integer> fileIds = new ArrayList<>(); while (docId != NO_MORE_DOCS) { fileIds.add(getFileId(docId)); if (fileIds.size() >= 2 && fileIds.get(fileIds.size() - 2) > (fileIds.get(fileIds.size() - 1))) { return false; } docId = postings.nextDoc(); } return true; } } private static int getFileId(int docId) { ... } } import org.apache.lucene.index.IndexSorter; import org.apache.lucene.search.SortField; import java.util.function.Function; public class DocSortField extends SortField { private Function<Integer, Integer> docIdToValue; public DocSortField(String field, Function<Integer, Integer> docIdToValue) { super(field, Type.DOC); this.docIdToValue = docIdToValue; } @Override public IndexSorter getIndexSorter() { return new DocIdSorter(Provider.NAME, docIdToValue); } } import org.apache.lucene.index.IndexSorter; import org.apache.lucene.index.LeafReader; import java.util.List; import java.util.function.Function; public class DocIdSorter implements IndexSorter { private final String providerName; private final Function<Integer, Integer> docIdToValue; public DocIdSorter(String providerName, Function<Integer, Integer> docIdToValue) { this.providerName = providerName; this.docIdToValue = docIdToValue; } @Override public ComparableProvider[] getComparableProviders(List<? 
extends LeafReader> readers) { ComparableProvider[] providers = new ComparableProvider[readers.size()]; for (int readerIndex = 0; readerIndex < readers.size(); readerIndex++) { providers[readerIndex] = docIdToValue::apply; } return providers; } @Override public DocComparator getDocComparator(LeafReader reader, int maxDoc) { return (docId1, docId2) -> Integer.compare(docIdToValue.apply(docId1), docIdToValue.apply(docId2)); } @Override public String getProviderName() { return providerName; } }
Read Entire Article