Advanced Usage
This section covers advanced Lucene index features and patterns.
Native Lucene Queries
For complex queries beyond the query parser syntax, use native Lucene Query objects.
Term Queries
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
Query query = new TermQuery(new Term("category", "electronics"));
List<Product> results = luceneIndex.query(query);
Boolean Queries
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("category", "electronics")), BooleanClause.Occur.MUST);
builder.add(new TermQuery(new Term("brand", "premium")), BooleanClause.Occur.SHOULD);
builder.add(new TermQuery(new Term("status", "discontinued")), BooleanClause.Occur.MUST_NOT);
Query query = builder.build();
List<Product> results = luceneIndex.query(query);
Numeric Range Queries
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.DoublePoint;
// Integer range
Query ageQuery = IntPoint.newRangeQuery("age", 18, 65);
// Long range
Query idQuery = LongPoint.newRangeQuery("id", 1000L, 2000L);
// Float range
Query scoreQuery = FloatPoint.newRangeQuery("score", 0.5f, 1.0f);
// Double range
Query priceQuery = DoublePoint.newRangeQuery("price", 10.0, 100.0);
// Exact match
Query exactPrice = DoublePoint.newExactQuery("price", 29.99);
Prefix Queries
import org.apache.lucene.search.PrefixQuery;
Query prefixQuery = new PrefixQuery(new Term("name", "wire"));
List<Product> results = luceneIndex.query(prefixQuery);
Wildcard Queries
import org.apache.lucene.search.WildcardQuery;
// * matches any sequence of characters
Query wildcardQuery = new WildcardQuery(new Term("name", "wire*"));
// ? matches any single character
Query singleCharQuery = new WildcardQuery(new Term("sku", "ABC-???-XYZ"));
Fuzzy Queries
import org.apache.lucene.search.FuzzyQuery;
// Find terms similar to "bluetooth" (handles typos)
Query fuzzyQuery = new FuzzyQuery(new Term("name", "bluetoth"));
List<Product> results = luceneIndex.query(fuzzyQuery);
Phrase Queries
import org.apache.lucene.search.PhraseQuery;
// Exact phrase match
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("content", "machine"));
builder.add(new Term("content", "learning"));
Query phraseQuery = builder.build();
// With slop (allows words in between) — setSlop must be called BEFORE build(),
// so build a second query to get the sloppy variant
builder.setSlop(2); // Allow up to 2 words between terms
Query sloppyPhraseQuery = builder.build();
Relevance Scoring
Access relevance scores to rank or filter results.
// Collect results with scores
List<ScoredResult<Product>> scoredResults = new ArrayList<>();
luceneIndex.query("name:wireless", (entityId, product, score) -> {
scoredResults.add(new ScoredResult<>(product, score));
});
// Sort by score descending
scoredResults.sort((a, b) -> Float.compare(b.score(), a.score()));
// Filter by minimum score
List<Product> highConfidence = scoredResults.stream()
.filter(r -> r.score() > 1.0f)
.map(ScoredResult::product)
.toList();
/**
 * Pairs a result entity with the Lucene relevance score it matched with.
 *
 * @param product the matched entity
 * @param score   the raw Lucene relevance score (higher is more relevant)
 */
public record ScoredResult<E>(E product, float score) {}
Boosting
Boost certain fields to increase their importance in scoring.
import org.apache.lucene.search.BoostQuery;
// Title matches are more important than content matches
Query titleQuery = new TermQuery(new Term("title", "python"));
Query contentQuery = new TermQuery(new Term("content", "python"));
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new BoostQuery(titleQuery, 2.0f), BooleanClause.Occur.SHOULD); // 2x boost
builder.add(contentQuery, BooleanClause.Occur.SHOULD);
List<Article> results = luceneIndex.query(builder.build());
Pagination
Implement pagination for large result sets.
/**
 * One page of search results together with its paging metadata.
 */
public class SearchResult<E>
{
// The entities on this page (at most pageSize entries).
private final List<E> items;
// NOTE(review): as produced by search(), this is capped at (page + 1) * pageSize
// because only that many hits are fetched — it is a lower bound, not the true
// total match count. Confirm before using it to compute page counts.
private final int totalHits;
// Zero-based page number this result represents.
private final int page;
// Maximum number of items per page.
private final int pageSize;
// ...
}
/**
 * Returns one page of products matching the given Lucene query text.
 *
 * @param queryText a query in Lucene query-parser syntax
 * @param page      zero-based page number
 * @param pageSize  maximum number of items per page; must be positive
 * @return the requested page plus paging metadata
 * @throws IllegalArgumentException if {@code page} is negative or
 *                                  {@code pageSize} is not positive
 */
public SearchResult<Product> search(String queryText, int page, int pageSize)
{
    if (page < 0 || pageSize <= 0)
    {
        throw new IllegalArgumentException(
            "page must be >= 0 and pageSize must be > 0 (page=" + page + ", pageSize=" + pageSize + ")");
    }
    int offset = page * pageSize;
    // Fetch enough hits to cover all preceding pages plus the requested one.
    List<Product> allResults = luceneIndex.query(queryText, offset + pageSize);
    // Skip to the correct page
    List<Product> pageResults = allResults.stream()
        .skip(offset)
        .limit(pageSize)
        .toList();
    // NOTE: allResults.size() is capped at offset + pageSize by the fetch above,
    // so the reported totalHits is a lower bound, not the true total match count.
    return new SearchResult<>(pageResults, allResults.size(), page, pageSize);
}
NOTE: For very large result sets, consider using Lucene's searchAfter mechanism for deep pagination instead of skipping over earlier pages.
Bulk Operations
For importing large amounts of data, disable auto-commit for better performance.
// Create context with auto-commit disabled
LuceneContext<Product> context = LuceneContext.builder()
.directoryCreator(DirectoryCreator.MMap(indexPath))
.documentPopulator(new ProductDocumentPopulator())
.autoCommit(false)
.build();
GigaMap<Product> products = GigaMap.New();
LuceneIndex<Product> luceneIndex = products.index().register(LuceneIndex.Category(context));
// Bulk import
int batchSize = 1000;
int count = 0;
for (Product product : productImport)
{
products.add(product);
count++;
// Commit periodically
if (count % batchSize == 0)
{
luceneIndex.commit();
}
}
// Final commit
luceneIndex.commit();
Custom Analyzers
Create custom analyzers for specialized text processing.
Keyword Analyzer
For fields that should not be tokenized.
import org.apache.lucene.analysis.core.KeywordAnalyzer;
/**
 * Supplies a {@link KeywordAnalyzer}, which treats the entire field value as a
 * single token (no tokenization, no lowercasing) — suitable for IDs, SKUs, and
 * other exact-match fields.
 */
public class KeywordAnalyzerCreator extends AnalyzerCreator
{
// Returns a fresh analyzer instance; KeywordAnalyzer is stateless and cheap.
@Override
public Analyzer create()
{
return new KeywordAnalyzer();
}
}
Whitespace Analyzer
Tokenizes only on whitespace, preserving case and punctuation.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
/**
 * Supplies a {@link WhitespaceAnalyzer}, which splits tokens on whitespace only
 * and preserves case and punctuation.
 */
public class WhitespaceAnalyzerCreator extends AnalyzerCreator
{
// Returns a fresh analyzer instance; WhitespaceAnalyzer is stateless and cheap.
@Override
public Analyzer create()
{
return new WhitespaceAnalyzer();
}
}
Per-Field Analyzer
Use different analyzers for different fields.
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
/**
 * Builds a per-field analyzer: exact-match (keyword) analysis for the "sku" and
 * "email" fields, standard tokenization for every other field.
 */
public class CustomAnalyzerCreator extends AnalyzerCreator
{
    @Override
    public Analyzer create()
    {
        // Map.of yields a concise immutable map; PerFieldAnalyzerWrapper only
        // reads from it, so mutability is not needed.
        Map<String, Analyzer> fieldAnalyzers = Map.of(
            "sku",   new KeywordAnalyzer(),
            "email", new KeywordAnalyzer()
        );
        // StandardAnalyzer is the fallback for fields without an explicit entry.
        return new PerFieldAnalyzerWrapper(new StandardAnalyzer(), fieldAnalyzers);
    }
}
Highlighting
Extract matching snippets from search results.
import org.apache.lucene.search.highlight.*;
/**
 * Returns the best-matching fragment of {@code text} for {@code queryText},
 * with matching terms wrapped in {@code <b>} tags, or {@code null} when the
 * query matches nothing in the text.
 *
 * @param text      the raw field content to highlight
 * @param queryText a query in Lucene query-parser syntax
 * @throws Exception if the query cannot be parsed or highlighting fails
 */
public String highlight(String text, String queryText) throws Exception
{
    // Use a single analyzer for both query parsing and text analysis so the
    // query terms and the highlighted tokens are analyzed identically.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    QueryParser parser = new QueryParser("content", analyzer);
    Query query = parser.parse(queryText);
    Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter("<b>", "</b>"),
        new QueryScorer(query)
    );
    // This overload creates and closes the TokenStream internally, avoiding the
    // leak of a manually created, never-closed stream.
    return highlighter.getBestFragment(analyzer, "content", text);
}
// Usage
String snippet = highlight(article.getContent(), "machine learning");
// Returns: "Introduction to <b>machine</b> <b>learning</b> algorithms..."
NOTE: Highlighting requires the lucene-highlighter dependency on the classpath.
Faceted Search
Count results by category using Lucene’s faceting.
// Requires lucene-facet dependency and additional setup
// See Apache Lucene faceting documentation for details
For simpler faceting, combine with GigaMap’s bitmap indices:
// Get Lucene search results
List<Product> searchResults = luceneIndex.query("name:wireless");
Set<Long> resultIds = searchResults.stream()
.map(Product::getId)
.collect(Collectors.toSet());
// Count by category using bitmap
Map<String, Long> categoryCounts = products.query(Query.all())
.filter(p -> resultIds.contains(p.getId()))
.collect(Collectors.groupingBy(Product::getCategory, Collectors.counting()));
Index Lifecycle
Performance Tips
-
Use StringField for exact matches - Faster than TextField for IDs, categories, status values.
-
Disable auto-commit for bulk imports - Commit periodically instead of after each operation.
-
Use MMapDirectory for large indexes - Better performance than embedded storage.
-
Limit result sets - Always specify a reasonable limit to avoid loading too many results.
-
Avoid leading wildcards - Queries like *phone are slow. Use edge n-grams instead.
-
Pre-compute commonly filtered fields - Store as StringField for fast filtering.