[ lucene扩展 ] 自定义Collector实现统计功能
Please read the full article at [ lucene扩展 ] 自定义Collector实现统计功能
在search中我们来看看collector是怎么收集结果的!
public void search(Weight weight, Filter filter, Collector collector) throws IOException { // TODO: should we make this // threaded...? the Collector could be sync'd? // always use single thread: for ( int i = 0 ; i < subReaders.length; i++) { // 检索每个子索引 collector.setNextReader(subReaders[i], docBase + docStarts[i]); final Scorer scorer = (filter == null ) ? weight.scorer( subReaders[i], !collector.acceptsDocsOutOfOrder(), true ) : FilteredQuery.getFilteredScorer(subReaders[i], getSimilarity(), weight, weight, filter); //构建打分器 if (scorer != null ) { scorer.score(collector); //打分 } } } |
scorer.score(collector)的过程如下:
public void score(Collector collector) throws IOException { collector.setScorer( this ); int doc; while ((doc = nextDoc()) != NO_MORE_DOCS) { collector.collect(doc); //搜集结果 } } |
collector.collect(doc)的过程如下:
@Override public void collect( int doc) throws IOException { float score = scorer.score(); // This collector cannot handle these scores: assert score != Float.NEGATIVE_INFINITY; assert !Float.isNaN(score); totalHits++; if (score <= pqTop.score) { // 以下的实现使用了优先级队列,如果当前分值小于队列中pqTop.score则直接pass! return ; } pqTop.doc = doc + docBase; pqTop.score = score; pqTop = pq.updateTop(); } |
/**
 * Demo collector that, for every collected document, looks up the value of
 * one field and hands it to a {@link GF} instance, which keeps a count per
 * distinct value.
 */
public class GroupCollectorDemo extends Collector {

    // Name of the field whose values are counted.
    private String f;

    // Field values of the current reader, indexed by the doc id passed to collect().
    private String[] fc;

    // Holds the grouped counting result.
    private GF gf = new GF();

    // Only assigned by their setters; not read anywhere in this class.
    String spliter;
    int length;

    /** Sets the name of the field to count. */
    public void setF(String f) {
        this.f = f;
    }

    /** Replaces the array of field values used by {@link #collect(int)}. */
    public void setFc(String[] fc) {
        this.fc = fc;
    }

    public void setSpliter(String spliter) {
        this.spliter = spliter;
    }

    public void setLength(int length) {
        this.length = length;
    }

    /** Returns the object holding the counts gathered so far. */
    public GF getGroupField() {
        return gf;
    }

    /** Does nothing: the scorer is not used by this collector. */
    @Override
    public void setScorer(Scorer scorer) throws IOException {
    }

    /**
     * Loads the values of the configured field for the new reader from the
     * default field cache. {@code docBase} is not used.
     */
    @Override
    public void setNextReader(IndexReader reader, int docBase) throws IOException {
        fc = FieldCache.DEFAULT.getStrings(reader, f);
    }

    /**
     * Passes the field value of {@code doc} to the group field, which is
     * responsible for counting how often each distinct value occurs.
     */
    @Override
    public void collect(int doc) throws IOException {
        final String fieldValue = fc[doc];
        gf.addValue(fieldValue);
    }

    /** Always reports that documents may arrive out of order. */
    @Override
    public boolean acceptsDocsOutOfOrder() {
        return true;
    }
}
/**
 * Keeps a count per distinct field value and can list the values ordered by
 * descending count.
 */
class GF {

    // Every distinct value seen so far; getValues() re-sorts this list in place
    // by descending count.
    private List<String> values = new ArrayList<String>();

    // Maps each field value to the number of times it was added.
    private Map<String, Integer> countMap = new HashMap<String, Integer>();

    /** Returns the live value-to-count map (not a copy). */
    public Map<String, Integer> getCountMap() {
        return countMap;
    }

    /** Replaces the value-to-count map. */
    public void setCountMap(Map<String, Integer> countMap) {
        this.countMap = countMap;
    }

    /**
     * Sorts the internal value list by descending count and returns it.
     * The sort is stable, so values with equal counts keep their current
     * relative order. Values without an entry in the count map are treated
     * as having a count of 0.
     *
     * @return the internal list (not a copy), sorted in place
     */
    public List<String> getValues() {
        Collections.sort(values, new ValueComparator());
        return values;
    }

    /** Replaces the list of values. */
    public void setValues(List<String> values) {
        this.values = values;
    }

    /**
     * Counts one occurrence of {@code value}. {@code null} and the empty
     * string are ignored. A value seen for the first time is also appended
     * to the value list.
     *
     * @param value field value to count
     */
    public void addValue(String value) {
        if (value == null || "".equals(value)) {
            return;
        }
        // Single lookup instead of get() followed by a second get().
        Integer current = countMap.get(value);
        if (current == null) {
            countMap.put(value, 1);
            values.add(value);
        } else {
            countMap.put(value, current + 1);
        }
    }

    // Count recorded for a value, or 0 when the map has no entry for it.
    private int countOf(String value) {
        Integer count = countMap.get(value);
        return count == null ? 0 : count.intValue();
    }

    /**
     * Orders values by descending count. A value missing from the count map
     * compares as count 0 instead of failing on auto-unboxing, which can
     * happen after setValues/setCountMap put the two collections out of sync.
     */
    class ValueComparator implements Comparator<String> {
        public int compare(String value0, String value1) {
            int count0 = countOf(value0);
            int count1 = countOf(value1);
            if (count0 > count1) {
                return -1;
            }
            if (count0 < count1) {
                return 1;
            }
            return 0;
        }
    }
}
No comments:
Post a Comment