Lucene的分组(Grouping/GroupBy)功能 « 克己服人,礼智谦让!
topN分组的主要原理:
两次检索
1) 执行第一次检索,应用FirstPassGroupingCollector,对组进行排序并截取(offset, offset+topn)范围内的组
2) 执行第二次检索,应用SecondPassGroupingCollector,为每个组提取前n条记录
为提高第二次检索的效率,引入CacheCollector,执行第一次检索时缓存匹配的记录,并在第二次检索读取缓存的记录
为记录总的组的数量,另外还引入了AllGroupsCollector
注意,如果需要对Lucene的score进行修正,则需要重载TermFirstPassGroupingCollector和 TermSecondPassGroupingCollector
topN分组的主要原理:
两次检索
1) 执行第一次检索,应用FirstPassGroupingCollector,对组进行排序并截取(offset, offset+topn)范围内的组
2) 执行第二次检索,应用SecondPassGroupingCollector,为每个组提取前n条记录
为提高第二次检索的效率,引入CacheCollector,执行第一次检索时缓存匹配的记录,并在第二次检索读取缓存的记录
为记录总的组的数量,另外还引入了AllGroupsCollector
注意,如果需要对Lucene的score进行修正,则需要重载TermFirstPassGroupingCollector和 TermSecondPassGroupingCollector
- public void groupBy(IndexSearcher searcher, Query query, Sort groupSort) throws IOException {
- int topNGroups = 10; // 每页需要多少个组
- int groupOffset = 0; // 起始的组
- boolean fillFields = true;
- Sort docSort = groupSort; // groupSort用于对组进行排序,docSort用于对组内记录进行排序,多数情况下两者是相同的,但也可不同
- // Sort docSort = new Sort(new SortField[] { new SortField("page", SortField.INT, true) });
- int docOffset = 0; // 用于组内分页,起始的记录
- int docsPerGroup = 2;// 每组返回多少条结果
- boolean requiredTotalGroupCount = true; // 是否需要计算总的组的数量
- // 如果需要对Lucene的score进行修正,则需要重载TermFirstPassGroupingCollector
- TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector(searcher.getIndexReader(), "author", groupSort, groupOffset + topNGroups);
- boolean cacheScores = true;
- double maxCacheRAMMB = 16.0;
- CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
- searcher.search(query, cachedCollector);
- Collection<searchgroup<string>> topGroups = c1.getTopGroups(groupOffset, fillFields);
- if (topGroups == null) {
- // No groups matched
- return;
- }
- HitCollector secondPassCollector = null;
- boolean getScores = true;
- boolean getMaxScores = true;
- // 如果需要对Lucene的score进行修正,则需要重载TermSecondPassGroupingCollector
- TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector(searcher.getIndexReader(), "author", topGroups, groupSort, docSort, docOffset + docsPerGroup, getScores, getMaxScores, fillFields);
- // Optionally compute total group count
- TermAllGroupsCollector allGroupsCollector = null;
- if (requiredTotalGroupCount) {
- allGroupsCollector = new TermAllGroupsCollector(searcher.getIndexReader(), "author");
- secondPassCollector = MultiCollector.wrap(c2, allGroupsCollector);
- } else {
- secondPassCollector = c2;
- }
- if (cachedCollector.isCached()) {
- // Cache fit within maxCacheRAMMB, so we can replay it:
- cachedCollector.replay(secondPassCollector);
- } else {
- // Cache was too large; must re-execute query:
- searcher.search(query, secondPassCollector);
- }
- int totalGroupCount = -1; // 所有组的数量
- int totalHitCount = -1; // 所有满足条件的记录数
- int totalGroupedHitCount = -1; // 所有组内的满足条件的记录数(通常该值与totalHitCount是一致的)
- if (requiredTotalGroupCount) {
- totalGroupCount = allGroupsCollector.getGroupCount();
- }
- System.out.println("groupCount: " + totalGroupCount);
- TopGroups<string> groupsResult = c2.getTopGroups(docOffset);
- totalHitCount = groupsResult.totalHitCount;
- totalGroupedHitCount = groupsResult.totalGroupedHitCount;
- System.out.println("groupsResult.totalHitCount:" + totalHitCount);
- System.out.println("groupsResult.totalGroupedHitCount:" + totalGroupedHitCount);
- int groupIdx = 0;
- // 迭代组
- for (GroupDocs<string> groupDocs : groupsResult.groups) {
- groupIdx++;
- System.out.println("group[" + groupIdx + "]:" + groupDocs.groupValue); // 组的标识
- System.out.println("group[" + groupIdx + "]:" + groupDocs.totalHits); // 组内的记录数
- int docIdx = 0;
- // 迭代组内的记录
- for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
- docIdx++;
- System.out.println("group[" + groupIdx + "][" + docIdx + "]:" + scoreDoc.doc + "/" + scoreDoc.score);
- Document doc = searcher.doc(scoreDoc.doc);
- System.out.println("group[" + groupIdx + "][" + docIdx + "]:" + doc);
- }
- }
- }
No comments:
Post a Comment