[ lucene扩展 ] 自定义Collector实现统计功能
Please read full article from [ lucene扩展 ] 自定义Collector实现统计功能
我们先来看看在 search 方法中,collector 是怎么收集结果的:
public void search(Weight weight, Filter filter, Collector collector) throws IOException { // TODO: should we make this // threaded...? the Collector could be sync'd? // always use single thread: for (int i = 0; i < subReaders.length; i++) { // 检索每个子索引 collector.setNextReader(subReaders[i], docBase + docStarts[i]); final Scorer scorer = (filter == null) ? weight.scorer( subReaders[i], !collector.acceptsDocsOutOfOrder(), true) : FilteredQuery.getFilteredScorer(subReaders[i], getSimilarity(), weight, weight, filter);//构建打分器 if (scorer != null) { scorer.score(collector);//打分 } } } |
scorer.score(collector)的过程如下:
public void score(Collector collector) throws IOException { collector.setScorer(this); int doc; while ((doc = nextDoc()) != NO_MORE_DOCS) { collector.collect(doc);//搜集结果 } } |
collector.collect(doc)的过程如下:
@Overridepublic void collect(int doc) throws IOException { float score = scorer.score(); // This collector cannot handle these scores: assert score != Float.NEGATIVE_INFINITY; assert !Float.isNaN(score); totalHits++; if (score <= pqTop.score) { // 以下的实现使用了优先级队列,如果当前分值小于队列中pqTop.score则直接pass! return; } pqTop.doc = doc + docBase; pqTop.score = score; pqTop = pq.updateTop();} |
public class GroupCollectorDemo extends Collector { private GF gf = new GF();// 保存分组统计结果 private String[] fc;// fieldCache private String f;// 统计字段 String spliter; int length; public void setFc(String[] fc) { this.fc = fc; } @Override public void setScorer(Scorer scorer) throws IOException { } @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { fc = FieldCache.DEFAULT.getStrings(reader, f); } @Override public void collect(int doc) throws IOException { // 添加的GroupField中,由GroupField负责统计每个不同值的数目 gf.addValue(fc[doc]); } @Override public boolean acceptsDocsOutOfOrder() { return true; } public GF getGroupField() { return gf; } public void setSpliter(String spliter) { this.spliter = spliter; } public void setLength(int length) { this.length = length; } public void setF(String f) { this.f = f; }}class GF { // 所有可能的分组字段值,排序按每个字段值的文档个数大小排序 private List<String> values = new ArrayList<String>(); // 保存字段值和文档个数的对应关系 private Map<String, Integer> countMap = new HashMap<String, Integer>(); public Map<String, Integer> getCountMap() { return countMap; } public void setCountMap(Map<String, Integer> countMap) { this.countMap = countMap; } public List<String> getValues() { Collections.sort(values, new ValueComparator()); return values; } public void setValues(List<String> values) { this.values = values; } public void addValue(String value) { if (value == null || "".equals(value)) return; if (countMap.get(value) == null) { countMap.put(value, 1); values.add(value); } else { countMap.put(value, countMap.get(value) + 1); } } class ValueComparator implements Comparator<String> { public int compare(String value0, String value1) { if (countMap.get(value0) > countMap.get(value1)) { return -1; } else if (countMap.get(value0) < countMap.get(value1)) { return 1; } return 0; } }}
No comments:
Post a Comment