com.flaptor.indextank.index.rti.inverted.InvertedIndex.java Source code

Introduction

Here is the source code for com.flaptor.indextank.index.rti.inverted.InvertedIndex.java, an in-memory inverted index from the IndexTank engine that maps (field, term) keys to posting lists and delegates query matching to a TermBasedQueryMatcher.

Source

/*
 * Copyright (c) 2011 LinkedIn, Inc
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.flaptor.indextank.index.rti.inverted;

import java.util.BitSet;
import java.util.Iterator;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;

import com.flaptor.indextank.Indexer;
import com.flaptor.indextank.index.DocId;
import com.flaptor.indextank.index.Document;
import com.flaptor.indextank.index.QueryMatcher;
import com.flaptor.indextank.index.ScoredMatch;
import com.flaptor.indextank.index.TopMatches;
import com.flaptor.indextank.index.scorer.FacetingManager;
import com.flaptor.indextank.index.scorer.Scorer;
import com.flaptor.indextank.index.term.DocTermMatch;
import com.flaptor.indextank.index.term.TermMatcher;
import com.flaptor.indextank.index.term.query.RawMatch;
import com.flaptor.indextank.index.term.query.TermBasedQueryMatcher;
import com.flaptor.indextank.query.AToken;
import com.flaptor.indextank.query.IndexEngineParser;
import com.flaptor.indextank.query.Query;
import com.flaptor.indextank.util.AbstractSkippableIterable;
import com.flaptor.indextank.util.AbstractSkippableIterator;
import com.flaptor.indextank.util.SkippableIterable;
import com.flaptor.indextank.util.SkippableIterator;
import com.flaptor.indextank.util.Skippables;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.MapMaker;
import com.google.common.collect.Maps;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;

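/**
 * In-memory inverted index. Maps (field, term) keys to posting lists
 * (DocTermMatchList) and records deletions in a BitSet so that deleted
 * documents are filtered out of query results without being removed from
 * the posting lists. Query matching is delegated to a TermBasedQueryMatcher
 * built on top of this index.
 */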
public class InvertedIndex implements Indexer, QueryMatcher, TermMatcher {

    private final int maxDocCount;
    private final DocId[] docids;
    private final BitSet internalDeletes;
    private final AtomicInteger docCount;

    private final ConcurrentMap<DocId, Integer> docidsIndexes = new MapMaker().makeMap();
    private final ConcurrentNavigableMap<Key, DocTermMatchList> invertedIndex = new ConcurrentSkipListMap<Key, DocTermMatchList>();
    private final ConcurrentMap<DocId, DocId> deletes = new MapMaker().makeMap();
    //private final ConcurrentHashMap<DocId, DocId> deletes = Maps.newConcurrentHashMap();
    private final QueryMatcher matcher;
    private final IndexEngineParser parser;

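    /**
     * @param maxDocCount maximum number of documents this index can hold; must be positive.
     */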
    public InvertedIndex(Scorer scorer, IndexEngineParser parser, int maxDocCount,
            FacetingManager facetingManager) {
        Preconditions.checkArgument(maxDocCount > 0);
        this.maxDocCount = maxDocCount;
        this.docids = new DocId[maxDocCount];
        this.internalDeletes = new BitSet(maxDocCount);
        this.docCount = new AtomicInteger(0);
        this.matcher = new TermBasedQueryMatcher(scorer, this, facetingManager);
        this.parser = parser;
    }

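    /**
     * Adds a document under the given external id. If the id was already
     * present, the previous internal copy is marked as deleted. Throws
     * IllegalStateException once maxDocCount documents have been added.
     */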
    public void add(String sdocid, final Document document) {
        int idx = docCount.getAndIncrement();
        if (idx < maxDocCount) {
            DocId docid = new DocId(sdocid);
            docids[idx] = docid;
            Integer oldIdx = docidsIndexes.put(docid, idx);
            if (oldIdx != null) {
                internalDel(oldIdx);
            }
            internalAdd(idx, document);
        } else {
            throw new IllegalStateException(
                    "MaxDocCount (" + maxDocCount + ") reached. Cannot add more documents.");
        }
    }

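    /**
     * Marks a document as deleted. If the external id is known to this index,
     * its internal slot is flagged in the deletion bitset; otherwise the id is
     * recorded in the deletes map so that hasChanges(docid) still reports it.
     */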
    public void del(String sdocid) {
        DocId docid = new DocId(sdocid);
        Integer idx = docidsIndexes.get(docid);
        if (idx != null) {
            internalDel(idx);
        } else {
            deletes.put(docid, docid);
        }
    }

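    // Tokenizes every field of the document, collects the positions of each
    // term, and appends a posting for this document to the list of every
    // (field, term) key.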
    private void internalAdd(int idx, final Document document) {
        for (String field : document.getFieldNames()) {
            Iterator<AToken> tokens = parser.parseDocumentField(field, document.getField(field));
            SortedSetMultimap<String, Integer> termPositions = TreeMultimap.create();
            int tokenCount = 0;
            while (tokens.hasNext()) {
                tokenCount++;
                AToken token = tokens.next();
                termPositions.put(token.getText(), token.getPosition());
            }

            for (String term : termPositions.keySet()) {
                Key key = new Key(field, term);
                SortedSet<Integer> positionsSet = termPositions.get(term);
                int[] positions = new int[positionsSet.size()];
                int p = 0;
                for (Integer i : positionsSet) {
                    positions[p++] = i;
                }
                DocTermMatchList original = invertedIndex.putIfAbsent(key,
                        new DocTermMatchList(idx, positions, tokenCount));
                if (original != null) {
                    original.add(idx, positions, tokenCount);
                }
            }
        }
    }

    private void internalDel(int idx) {
        internalDeletes.set(idx);
    }

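    /**
     * Returns the posting list for the given field and term, filtering out
     * internally deleted documents; empty if the term is unknown.
     */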
    public SkippableIterable<DocTermMatch> getMatches(String field, String term) {
        DocTermMatchList docList = invertedIndex.get(new Key(field, term));
        if (docList == null) {
            return Skippables.emptyIterable();
        } else {
            return Skippables.filter(docList, notDeletedPredicate());
        }
    }

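    /**
     * Returns the posting lists for up to 1000 terms of the given field in the
     * range [termFrom, termTo), keyed by term and filtered for deletions.
     */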
    @Override
    public NavigableMap<String, SkippableIterable<DocTermMatch>> getMatches(String field, String termFrom,
            String termTo) {
        Key leftBoundary = new Key(field, termFrom);
        Key rightBoundary = new Key(field, termTo);

        ConcurrentNavigableMap<Key, DocTermMatchList> range = invertedIndex.subMap(leftBoundary, rightBoundary);

        NavigableMap<String, SkippableIterable<DocTermMatch>> result = new TreeMap<String, SkippableIterable<DocTermMatch>>();

        int numberOfTerms = 0;
        for (Entry<Key, DocTermMatchList> entry : range.entrySet()) {
            result.put(entry.getKey().term, Skippables.filter(entry.getValue(), notDeletedPredicate()));
            numberOfTerms++;
            if (numberOfTerms >= 1000) {
                break;
            }
        }

        return result;
    }

    private Predicate<DocTermMatch> notDeletedPredicate() {
        return new Predicate<DocTermMatch>() {
            @Override
            public boolean apply(DocTermMatch item) {
                return !internalDeletes.get(item.getRawId());
            }
        };
    }

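    /**
     * True if this index has recorded the given docid, either as an added
     * document or as a deletion of a document it never contained.
     */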
    public boolean hasChanges(DocId docid) {
        return docidsIndexes.containsKey(docid) || deletes.containsKey(docid);
    }

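    /**
     * Translates raw internal matches into ScoredMatches, normalizing the
     * boosted score by boostedNorm and resolving internal ids back to
     * external DocIds.
     */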
    @Override
    public Iterable<ScoredMatch> decode(Iterable<RawMatch> rawMatches, final double boostedNorm) {
        return Iterables.transform(rawMatches, new Function<RawMatch, ScoredMatch>() {
            @Override
            public ScoredMatch apply(RawMatch rawMatch) {
                //System.out.println("RESULT: "+rawMatch.getNormalizedScore());
                return new ScoredMatch(rawMatch.getBoostedScore() / boostedNorm, docids[rawMatch.getRawId()]);
            }
        });
    }

    /* QueryMatcher interface - delegates to the internal matcher instance */

    public TopMatches findMatches(Query query, int limit, int scoringFunctionIndex) throws InterruptedException {
        return matcher.findMatches(query, limit, scoringFunctionIndex);
    }

    public TopMatches findMatches(Query query, Predicate<DocId> idFilter, int limit, int scoringFunctionIndex)
            throws InterruptedException {
        return matcher.findMatches(query, idFilter, limit, scoringFunctionIndex);
    }

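    /**
     * Skippable iteration over the internal ids of all documents added so far,
     * skipping internally deleted ones.
     */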
    @Override
    public SkippableIterable<Integer> getAllDocs() {
        return new AbstractSkippableIterable<Integer>() {
            @Override
            public SkippableIterator<Integer> iterator() {
                return new AbstractSkippableIterator<Integer>() {
                    int current = -1;

                    @Override
                    public void skipTo(int i) {
                        current = i - 1;
                    }

                    @Override
                    protected Integer computeNext() {
                        while (++current < docCount.get()) {
                            if (!internalDeletes.get(current)) {
                                return current;
                            }
                        }
                        return endOfData();
                    }
                };
            }
        };
    }

    @Override
    public int countMatches(Query query) throws InterruptedException {
        return matcher.countMatches(query);
    }

    @Override
    public int countMatches(Query query, Predicate<DocId> idFilter) throws InterruptedException {
        return matcher.countMatches(query, idFilter);
    }

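    /**
     * Basic index statistics (document count, term count, deletes and internal
     * deletes), with the given prefix prepended to every key.
     */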
    public Map<String, String> getStats(String prefix) {
        Map<String, String> stats = Maps.newHashMap();
        stats.put(prefix + "size", String.valueOf(docCount.get()));
        stats.put(prefix + "terms", String.valueOf(invertedIndex.size()));
        stats.put(prefix + "deletes", String.valueOf(deletes.size()));
        stats.put(prefix + "internal_deletes", String.valueOf(internalDeletes.cardinality()));
        return stats;
    }

}
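
Example

The following is a minimal usage sketch, not part of the original file. It assumes that instances of Scorer, IndexEngineParser, FacetingManager, Document and Query are obtained elsewhere in the IndexTank codebase and are simply passed in, and it only calls methods declared in the class above; the class name, document ids, capacity and limits are illustrative.

import com.flaptor.indextank.index.Document;
import com.flaptor.indextank.index.TopMatches;
import com.flaptor.indextank.index.rti.inverted.InvertedIndex;
import com.flaptor.indextank.index.scorer.FacetingManager;
import com.flaptor.indextank.index.scorer.Scorer;
import com.flaptor.indextank.query.IndexEngineParser;
import com.flaptor.indextank.query.Query;

public class InvertedIndexUsageSketch {

    // All collaborators are passed in, since their construction is defined
    // elsewhere in the IndexTank codebase.
    public static TopMatches indexAndSearch(Scorer scorer, IndexEngineParser parser,
            FacetingManager facetingManager, Document document, Query query)
            throws InterruptedException {
        // An index that holds at most 1000 documents before add() starts throwing.
        InvertedIndex index = new InvertedIndex(scorer, parser, 1000, facetingManager);

        index.add("doc-1", document);   // index a document under an external id
        index.del("doc-2");             // record a delete for an id this index never saw

        // Delegate to the internal TermBasedQueryMatcher; keep the 10 best
        // matches using scoring function 0.
        return index.findMatches(query, 10, 0);
    }
}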