To use this index you will need some Lucene extra classes that you can download from
http://sourceforge.net/projects/luceneqe/

The index is a typical Lucene index where the word position information has been removed
to avoid rebuilding the original documents. This information loss is not a very important
problem when using this index in evaluation tasks, since the most important data has been
preserved. If you want to know what information this index contains, you can use the
Lucene index inspector Luke: http://www.getopt.org/luke/

The document field name where the document contents (title and body) have been stored is
"text". We indexed the title and the document body together.

Installation and use
--------------------

You will need the following libraries (available here):

Lucene-core-2.3.0.jar
Lucene-highlighter-2.3.0.jar
Lucene-memory-2.3.0.jar
snowball-1.0.jar
wn.jar (WordNet)

And our code: LuceneQE.tar.gz available here: http://sourceforge.net/projects/luceneqe/

Now you can try to run these examples for the different options that Lucene implements:

BM25 simple query: http://grasia.fdi.ucm.es/jose/query-expansion/BM25.java

package experiments;

import java.io.File;
import java.util.Date;

import modelsIR.BM25MultiTermQuery;
import modelsIR.SimilarityBM25;

import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.TopDocs;

import utils.BM25Parameters;
import utils.CollectionStats;

/**
 * Example: plain BM25 retrieval over the Robust WSD CLEF index.
 *
 * <p>Usage: {@code BM25 <indexDir> <query> <field>}
 *
 * <p>Prints, for each of the top 1000 hits, the rank, the stored "id" field,
 * the internal Lucene doc id and the BM25 score, followed by the elapsed time.
 */
public class BM25 {

    public static void main(String[] args) throws Exception {
        String index = args[0]; // path to the index directory
        String query = args[1]; // free-text query
        String field = args[2]; // field to search ("text" for this index)

        // Avg. document length for the Robust CLEF collection.
        CollectionStats.setAvgDocumentLength(598.0f);
        // BM25 free parameters (b, k1) tuned for this collection.
        BM25Parameters.setB(0.67f);
        BM25Parameters.setK1(3.8f);

        long start = new Date().getTime();

        File indexDir = new File(index); // Here we need the index directory
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir + " does not exist or is not a directory.");
        }

        /*
         * Loading Lucene index
         */
        SnowballAnalyzer analyzer = new SnowballAnalyzer("English");
        IndexSearcher is = new IndexSearcher(index);
        System.out.println("MaxDoc: " + is.maxDoc());
        is.setSimilarity(new SimilarityBM25()); // replace default TF-IDF scoring with BM25
        IndexReader ir = is.getIndexReader();
        QueryParser parser = new QueryParser(field, analyzer);

        // Searching
        String[] fields = { field }; // "text" is the field name for this index
        Query queryBM25 = new BM25MultiTermQuery(parser.parse(query), fields, query);
        TopDocCollector collector = new TopDocCollector(1000); // keep top 1000 hits
        is.search(queryBM25, collector);
        TopDocs results = collector.topDocs();
        ScoreDoc[] doc = results.scoreDocs;
        for (int n = 0; n < doc.length; n++) {
            int d = doc[n].doc;
            float s = doc[n].score;
            Document document = ir.document(d);
            System.out.println(n + " " + document.get("id"));
            // System.out.println(is.explain(queryBM25, d).toString());
            System.out.println(d + " " + s);
        }

        is.close();
        ir.close();

        long end = new Date().getTime();
        System.out.println("Time: " + (end - start) / 1000 + " seconds");
    }
}

Query Expansion Example using KL Divergence:
http://grasia.fdi.ucm.es/jose/query-expansion/KLDQueryExpansion.java

package experiments;

import java.io.File;
import java.util.Date;
import java.util.List;

import modelsIR.BM25MultiTermQuery;
import modelsIR.SimilarityBM25;

import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.TopDocs; import queryExpansion.modelsQE.QueryExpansionModel; import queryExpansion.modelsQE.QueryExpansionModelFactory; import utils.BM25Parameters; import utils.CollectionStats; import utils.ExpansionParameters; public class KLDQueryExpansion { public static void main(String[] args) throws Exception { String index = args[0]; String query = args[1]; String field = args[2]; String fields[] = {field}; CollectionStats.setAvgDocumentLength(598.0f); CollectionStats.setFreqTokenCollection(34906437.0); CollectionStats.setTotalDocumentsCollection(110245.0); CollectionStats.setFields(fields); ExpansionParameters.setBetaRocchio(0.3); ExpansionParameters.setIDFthreshold(2.5); ExpansionParameters.setNumExpansionTerms(40); ExpansionParameters.setTopNdocuments(1); BM25Parameters.setB(0.67f); BM25Parameters.setK1(3.8f); long start = new Date().getTime(); File indexDir = new File(index); if (!indexDir.exists() || !indexDir.isDirectory()) { throw new Exception(indexDir + " does not exist or is not a directory."); } //WordPreprocessor.loadStopWordsList("MY-STOP-WORD-LIST"); /* * Loading Lucene index */ SnowballAnalyzer analyzer = new SnowballAnalyzer("English"); IndexSearcher is = new IndexSearcher(index); System.out.println("MaxDoc: " + is.maxDoc()); is.setSimilarity(new SimilarityBM25()); IndexReader ir = is.getIndexReader(); QueryParser parser = new QueryParser(field, analyzer); // Query Expansion Query queryBM25 = new BM25MultiTermQuery(parser.parse(query), fields, query); QueryExpansionModelFactory .selectQueryExpansionModel(QueryExpansionModelFactory.KLD); QueryExpansionModel thes = QueryExpansionModelFactory .getQueryExpansionModel(ir, is, query, is.search(queryBM25),fields); List candidateList = thes.getPlainList(); String queryTotalExpanded = candidateList.toString(); System.out.println(queryTotalExpanded); queryTotalExpanded = queryTotalExpanded.replaceAll("\\[", ""); queryTotalExpanded = queryTotalExpanded.replaceAll("\\]", ""); 
queryTotalExpanded = queryTotalExpanded.replaceAll("\\,", ""); System.out.println("Expansion: " + queryTotalExpanded); // Searching Query queryBM25Expanded = new BM25MultiTermQuery(parser .parse(queryTotalExpandida), fields, queryTotalExpandida); TopDocCollector collector = new TopDocCollector(1000); is.search(queryBM25Expanded, collector); TopDocs results = collector.topDocs(); ScoreDoc[] doc = results.scoreDocs; for (int n = 0; n < doc.length; n++) { int d = doc[n].doc; float s = doc[n].score; Document document = ir.document(d); System.out.println(n + " " + document.get("id")); // System.out.println(is.explain(queryBM25, d).toString()); System.out.println(d + " " + s); } // System.out.println(is.explain(queryBM25Expanded, // doc[0].doc).toString()); is.close(); ir.close(); long end = new Date().getTime(); System.out.println("Time: " + (end - start) / 1000 + " segundos"); } } How to use our code You can use our code in two different ways: Simple BM25 retrieval and BM25 + Query Expansion Retrieval. Simple BM25 retrieval and query expansion modules are very easy to use, since we have implemented this functions using the same schema defined by Lucene. Our BM25 implementation only use OR operator. Robust WSD CLEF track: indexing With this Robust WSD CLEF index, you can use our code to implement different experiments. The following java classes has been used for us to index Robust WSD CLEF collection, and maybe could be useful for you if you want to work with this collection to implement differents semantic query expansion experiments. You will need also this library to work with bz2 files: http://grasia.fdi.ucm.es/jose/query-expansion/bzip2.jar If you need more information about this code and the algorithms implemented you can visit: http://grasia.fdi.ucm.es/jose/query-expansion/