[ lucene扩展 ] 自定义Collector实现统计功能



[ lucene扩展 ] 自定义Collector实现统计功能
在search中我们来看看collector是怎么收集结果的!
public void search(Weight weight, Filter filter, Collector collector)
            throws IOException {
 
        // TODO: should we make this
        // threaded...? the Collector could be sync'd?
 
        // always use single thread:
        for (int i = 0; i < subReaders.length; i++) { // 检索每个子索引
            collector.setNextReader(subReaders[i], docBase + docStarts[i]);
            final Scorer scorer = (filter == null) ? weight.scorer(
                    subReaders[i], !collector.acceptsDocsOutOfOrder(), true)
                    : FilteredQuery.getFilteredScorer(subReaders[i],
                            getSimilarity(), weight, weight, filter);//构建打分器
            if (scorer != null) {
                scorer.score(collector);//打分
            }
        }
    }
scorer.score(collector)的过程如下:
public void score(Collector collector) throws IOException {
    collector.setScorer(this);
    int doc;
    while ((doc = nextDoc()) != NO_MORE_DOCS) {
      collector.collect(doc);//搜集结果
    }
  }
collector.collect(doc)的过程如下:
@Override
public void collect(int doc) throws IOException {
  float score = scorer.score();
 
  // This collector cannot handle these scores:
  assert score != Float.NEGATIVE_INFINITY;
  assert !Float.isNaN(score);
 
  totalHits++;
  if (score <= pqTop.score) {
    // 以下的实现使用了优先级队列,如果当前分值小于队列中pqTop.score则直接pass!
    return;
  }
  pqTop.doc = doc + docBase;
  pqTop.score = score;
  pqTop = pq.updateTop();
}
public class GroupCollectorDemo extends Collector {
 
    private GF gf = new GF();// 保存分组统计结果
    private String[] fc;// fieldCache
    private String f;// 统计字段
    String spliter;
    int length;
 
    public void setFc(String[] fc) {
        this.fc = fc;
    }
 
    @Override
    public void setScorer(Scorer scorer) throws IOException {
    }
 
    @Override
    public void setNextReader(IndexReader reader, int docBase)
            throws IOException {
        fc = FieldCache.DEFAULT.getStrings(reader, f);
    }
 
    @Override
    public void collect(int doc) throws IOException {
        // 添加的GroupField中,由GroupField负责统计每个不同值的数目
        gf.addValue(fc[doc]);
    }
 
    @Override
    public boolean acceptsDocsOutOfOrder() {
        return true;
    }
 
    public GF getGroupField() {
        return gf;
    }
 
    public void setSpliter(String spliter) {
        this.spliter = spliter;
    }
 
    public void setLength(int length) {
        this.length = length;
    }
 
    public void setF(String f) {
        this.f = f;
    }
}
 
/**
 * Accumulates, for one grouped field, how many times each distinct value
 * has been added.
 *
 * <p>{@link #addValue(String)} records one occurrence of a value;
 * {@link #getValues()} returns the distinct values ordered by descending
 * count. A value that has no entry in the count map (possible after
 * {@link #setValues(List)} or {@link #setCountMap(Map)}) is treated as
 * having a count of zero when sorting.
 */
class GF {
    // All distinct group values seen so far; getValues() sorts this list
    // by the number of documents per value, largest first.
    private List<String> values = new ArrayList<String>();
    // Maps each group value to its document count.
    private Map<String, Integer> countMap = new HashMap<String, Integer>();

    /** Returns the live value-to-count map (not a copy). */
    public Map<String, Integer> getCountMap() {
        return countMap;
    }

    /** Replaces the value-to-count map. */
    public void setCountMap(Map<String, Integer> countMap) {
        this.countMap = countMap;
    }

    /**
     * Sorts the internal value list in place by descending count and
     * returns it. The sort is stable, so values with equal counts keep
     * their current relative order.
     *
     * @return the live, freshly sorted list of distinct values
     */
    public List<String> getValues() {
        Collections.sort(values, new ValueComparator());
        return values;
    }

    /** Replaces the list of distinct values. */
    public void setValues(List<String> values) {
        this.values = values;
    }

    /**
     * Records one occurrence of the given value. {@code null} and the
     * empty string are ignored.
     *
     * @param value the field value of one document
     */
    public void addValue(String value) {
        if (value == null || "".equals(value)) {
            return;
        }
        // Single lookup: null means this is the first time the value is seen.
        Integer seen = countMap.get(value);
        if (seen == null) {
            countMap.put(value, 1);
            values.add(value);
        } else {
            countMap.put(value, seen + 1);
        }
    }

    /** Returns the recorded count for the value, or 0 when there is none. */
    private int countOf(String value) {
        Integer count = countMap.get(value);
        return count == null ? 0 : count.intValue();
    }

    /** Orders values by descending count; unknown values count as zero. */
    class ValueComparator implements Comparator<String> {
        public int compare(String value0, String value1) {
            // Go through countOf() so a value missing from the map cannot
            // trigger a NullPointerException during unboxing.
            int count0 = countOf(value0);
            int count1 = countOf(value1);
            if (count0 > count1) {
                return -1;
            } else if (count0 < count1) {
                return 1;
            }
            return 0;
        }
    }
}
Please read full article from [ lucene扩展 ] 自定义Collector实现统计功能

No comments:

Post a Comment

Labels

Algorithm (219) Lucene (130) LeetCode (97) Database (36) Data Structure (33) text mining (28) Solr (27) java (27) Mathematical Algorithm (26) Difficult Algorithm (25) Logic Thinking (23) Puzzles (23) Bit Algorithms (22) Math (21) List (20) Dynamic Programming (19) Linux (19) Tree (18) Machine Learning (15) EPI (11) Queue (11) Smart Algorithm (11) Operating System (9) Java Basic (8) Recursive Algorithm (8) Stack (8) Eclipse (7) Scala (7) Tika (7) J2EE (6) Monitoring (6) Trie (6) Concurrency (5) Geometry Algorithm (5) Greedy Algorithm (5) Mahout (5) MySQL (5) xpost (5) C (4) Interview (4) Vi (4) regular expression (4) to-do (4) C++ (3) Chrome (3) Divide and Conquer (3) Graph Algorithm (3) Permutation (3) Powershell (3) Random (3) Segment Tree (3) UIMA (3) Union-Find (3) Video (3) Virtualization (3) Windows (3) XML (3) Advanced Data Structure (2) Android (2) Bash (2) Classic Algorithm (2) Debugging (2) Design Pattern (2) Google (2) Hadoop (2) Java Collections (2) Markov Chains (2) Probabilities (2) Shell (2) Site (2) Web Development (2) Workplace (2) angularjs (2) .Net (1) Amazon Interview (1) Android Studio (1) Array (1) Boilerpipe (1) Book Notes (1) ChromeOS (1) Chromebook (1) Codility (1) Desgin (1) Design (1) Divide and Conqure (1) GAE (1) Google Interview (1) Great Stuff (1) Hash (1) High Tech Companies (1) Improving (1) LifeTips (1) Maven (1) Network (1) Performance (1) Programming (1) Resources (1) Sampling (1) Sed (1) Smart Thinking (1) Sort (1) Spark (1) Stanford NLP (1) System Design (1) Trove (1) VIP (1) tools (1)

Popular Posts