org.netbeans.modules.jackpot30.impl.duplicates.indexing.RemoteDuplicatesIndex.java Source code

Introduction

Here is the source code for org.netbeans.modules.jackpot30.impl.duplicates.indexing.RemoteDuplicatesIndex.java. The class asks remote Jackpot 30 index servers which files contain given duplicate-code hashes, caches the answers in a local Lucene index, and turns the hits into DuplicateDescription objects.
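
Before the listing, a minimal sketch of how a caller might use the class. The method below and its parameters are hypothetical and only show the shape of the entry point; the imports are the ones from the listing:

//hypothetical caller, invented for illustration: hash keys are expected to
//end with ":<length>" (see getValue in the listing) and the long[] values
//are {start, end} offsets into currentFile:
static void showDuplicates(FileObject currentFile, Map<String, long[]> hashes)
        throws IOException, URISyntaxException {
    AtomicBoolean cancel = new AtomicBoolean();
    List<DuplicateDescription> duplicates =
            RemoteDuplicatesIndex.findDuplicates(hashes, currentFile, cancel);
    //each returned description groups the occurrences of one duplicated snippet
}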

Source

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 2010 Oracle and/or its affiliates. All rights reserved.
 *
 * Oracle and Java are registered trademarks of Oracle and/or its affiliates.
 * Other names may be trademarks of their respective owners.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common
 * Development and Distribution License("CDDL") (collectively, the
 * "License"). You may not use this file except in compliance with the
 * License. You can obtain a copy of the License at
 * http://www.netbeans.org/cddl-gplv2.html
 * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
 * specific language governing permissions and limitations under the
 * License.  When distributing the software, include this License Header
 * Notice in each file and include the License file at
 * nbbuild/licenses/CDDL-GPL-2-CP.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the GPL Version 2 section of the License file that
 * accompanied this code. If applicable, add the following below the
 * License Header, with the fields enclosed by brackets [] replaced by
 * your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * If you wish your version of this file to be governed by only the CDDL
 * or only the GPL Version 2, indicate your decision by adding
 * "[Contributor] elects to include this software in this distribution
 * under the [CDDL or GPL Version 2] license." If you do not indicate a
 * single choice of license, a recipient has the option to distribute
 * your version of this file under either the CDDL, the GPL Version 2 or
 * to extend the choice of license to its licensees as provided above.
 * However, if you add GPL Version 2 code and therefore, elected the GPL
 * Version 2 license, then the option applies only if the new code is
 * made subject to such option by the copyright holder.
 *
 * Contributor(s):
 *
 * Portions Copyrighted 2010 Sun Microsystems, Inc.
 */

package org.netbeans.modules.jackpot30.impl.duplicates.indexing;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.codeviation.pojson.Pojson;
import org.netbeans.modules.jackpot30.common.api.LuceneHelpers.BitSetCollector;
import org.netbeans.modules.jackpot30.impl.duplicates.ComputeDuplicates.DuplicateDescription;
import org.netbeans.modules.jackpot30.impl.duplicates.ComputeDuplicates.Span;
import org.netbeans.modules.jackpot30.remoting.api.LocalCache;
import org.netbeans.modules.jackpot30.remoting.api.LocalCache.Task;
import org.netbeans.modules.jackpot30.remoting.api.RemoteIndex;
import org.netbeans.modules.jackpot30.remoting.api.WebUtilities;
import org.openide.filesystems.FileObject;
import org.openide.filesystems.URLMapper;
import org.openide.util.Exceptions;

/**
 * Finds duplicates of local code snippets in remote Jackpot 30 indices. The
 * servers' answers are cached in a local Lucene index, so repeated queries
 * for the same hashes do not go over the network again.
 *
 * @author lahvac
 */
@SuppressWarnings("ClassWithMultipleLoggers")
public class RemoteDuplicatesIndex {

    private static final Logger TIMER = Logger.getLogger("TIMER");
    private static final Logger LOG = Logger.getLogger(RemoteDuplicatesIndex.class.getName());

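    /**
     * Entry point: resolves the given snippet hashes against all configured
     * remote indices and converts the hits into duplicate descriptions.
     */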
    public static List<DuplicateDescription> findDuplicates(Map<String, long[]> hashes, FileObject currentFile,
            AtomicBoolean cancel) throws IOException, URISyntaxException {
        return translate(hashes, findHashOccurrences(hashes.keySet(), currentFile, cancel), currentFile);
    }

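    /**
     * For each hash, collects the files (per remote index) that contain it.
     * The local cache is consulted first; only hashes unknown to the cache
     * are sent to the server, and the server's answers, including negative
     * ones, are stored back into the cache.
     */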
    private static Map<String, Map<RemoteIndex, Collection<String>>> findHashOccurrences(
            Collection<? extends String> hashes, FileObject currentFile, AtomicBoolean cancel)
            throws IOException, URISyntaxException {
        List<RemoteIndex> validIndices = new ArrayList<RemoteIndex>();

        for (RemoteIndex ri : RemoteIndex.loadIndices()) {
            try {
                //validate the remote URI eagerly, so that one misconfigured
                //index does not abort the query for all the remaining ones:
                ri.remote.toURI();
                validIndices.add(ri);
            } catch (URISyntaxException ex) {
                Exceptions.printStackTrace(ex);
            }
        }

        Map<String, Map<RemoteIndex, Collection<String>>> result = new LinkedHashMap<String, Map<RemoteIndex, Collection<String>>>();
        long localTime = 0;
        long remoteTime = 0;

        for (RemoteIndex ri : validIndices) {
            if (cancel.get())
                return Collections.emptyMap();

            Set<String> toProcess = new LinkedHashSet<String>(hashes);
            Map<String, Map<String, Collection<? extends String>>> indexResult = new LinkedHashMap<String, Map<String, Collection<? extends String>>>();

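            //resolve as many hashes as possible from the local cache first: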
            long locS = System.currentTimeMillis();
            indexResult.putAll(findHashOccurrencesInLocalCache(ri, toProcess, cancel));
            localTime += System.currentTimeMillis() - locS;

            toProcess.removeAll(indexResult.keySet());

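            //ask the server only about the hashes the cache did not know: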
            if (!toProcess.isEmpty()) {
                long remS = System.currentTimeMillis();
                Map<String, Map<String, Collection<? extends String>>> remoteResults = findHashOccurrencesRemote(
                        ri.remote.toURI(), toProcess, cancel);
                remoteTime += System.currentTimeMillis() - remS;

                Map<String, Map<String, Collection<? extends String>>> toSave = new LinkedHashMap<String, Map<String, Collection<? extends String>>>(
                        remoteResults);

                for (String hash : toProcess) {
                    if (!toSave.containsKey(hash)) {
                        toSave.put(hash, Collections.<String, Collection<? extends String>>emptyMap());
                    }
                }

                if (cancel.get())
                    return Collections.emptyMap();

                saveToLocalCache(ri, toSave);

                indexResult.putAll(remoteResults);
            }

            for (Entry<String, Map<String, Collection<? extends String>>> e : indexResult.entrySet()) {
                Map<RemoteIndex, Collection<String>> hashResult = result.get(e.getKey());

                if (hashResult == null) {
                    result.put(e.getKey(), hashResult = new LinkedHashMap<RemoteIndex, Collection<String>>());
                }

                for (Entry<String, Collection<? extends String>> insideHash : e.getValue().entrySet()) {
                    if (cancel.get())
                        return Collections.emptyMap();

                    Collection<String> dupes = hashResult.get(ri);

                    if (dupes == null) {
                        hashResult.put(ri, dupes = new LinkedHashSet<String>());
                    }

                    dupes.addAll(insideHash.getValue());
                }
            }
        }

        TIMER.log(Level.FINE, "local hash duplicates", new Object[] { currentFile, localTime });
        TIMER.log(Level.FINE, "remote hash duplicates", new Object[] { currentFile, remoteTime });
        return result;
    }

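    /**
     * Asks the remote server's duplicates/findDuplicates service which files
     * contain the given hashes; an empty map is returned on cancellation or
     * when the server cannot be reached.
     */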
    private static Map<String, Map<String, Collection<? extends String>>> findHashOccurrencesRemote(URI remoteIndex,
            Iterable<? extends String> hashes, AtomicBoolean cancel) {
        try {
            String indexURL = remoteIndex.toASCIIString();
            URI u = new URI(indexURL + "/duplicates/findDuplicates?hashes="
                    + WebUtilities.escapeForQuery(Pojson.save(hashes)));
            String hashesMap = WebUtilities.requestStringResponse(u, cancel);

            if (hashesMap == null || cancel.get()) {
                //the computation was cancelled, or the remote server could
                //not be reached; report no occurrences:
                return Collections.emptyMap();
            }
            return Pojson.load(LinkedHashMap.class, hashesMap);
        } catch (URISyntaxException ex) {
            //XXX: better handling?
            Exceptions.printStackTrace(ex);
            return Collections.emptyMap();
        }
    }

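    /**
     * Looks the hashes up in the local Lucene cache. The cached paths have
     * the form "<segment>/<relative path>" and are split back into a map
     * keyed by segment here.
     */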
    private static Map<String, Map<String, Collection<? extends String>>> findHashOccurrencesInLocalCache(
            RemoteIndex ri, final Iterable<? extends String> hashes, AtomicBoolean cancel)
            throws IOException, URISyntaxException {
        return LocalCache.runOverLocalCache(ri,
                new Task<IndexReader, Map<String, Map<String, Collection<? extends String>>>>() {
                    @Override
                    public Map<String, Map<String, Collection<? extends String>>> run(IndexReader reader,
                            AtomicBoolean cancel) throws IOException {
                        Map<String, Map<String, Collection<String>>> result = new LinkedHashMap<String, Map<String, Collection<String>>>();

                        for (Entry<String, Collection<? extends String>> e : containsHash(reader, hashes, cancel)
                                .entrySet()) {
                            if (cancel.get())
                                return Collections.emptyMap();

                            Map<String, Collection<String>> forHash = result.get(e.getKey());

                            if (forHash == null) {
                                result.put(e.getKey(), forHash = new LinkedHashMap<String, Collection<String>>());
                            }

                            for (String path : e.getValue()) {
                                String segment = path.substring(0, path.indexOf('/'));

                                path = path.substring(path.indexOf('/') + 1);

                                Collection<String> list = forHash.get(segment);

                                if (list == null) {
                                    forHash.put(segment, list = new LinkedList<String>());
                                }

                                list.add(path);
                            }
                        }

                        //raw cast: widens the value type from Collection<String>
                        //to Collection<? extends String>, which is safe for
                        //callers that only read the map:
                        return (Map) result;
                    }
                }, Collections.<String, Map<String, Collection<? extends String>>>emptyMap(), cancel);
    }

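    /**
     * Stores the remote server's answers into the local cache, one Lucene
     * document per hash; negative answers become documents without any
     * "path" field.
     */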
    private static synchronized void saveToLocalCache(RemoteIndex ri,
            final Map<String, Map<String, Collection<? extends String>>> what)
            throws IOException, URISyntaxException {
        LocalCache.saveToLocalCache(ri, new Task<IndexWriter, Void>() {
            @Override
            public Void run(IndexWriter w, AtomicBoolean cancel) throws IOException {
                for (Entry<String, Map<String, Collection<? extends String>>> e : what.entrySet()) {
                    Document doc = new Document();

                    doc.add(new Field("hash", e.getKey(), Store.YES, Index.NOT_ANALYZED));

                    for (Entry<String, Collection<? extends String>> pe : e.getValue().entrySet()) {
                        for (String path : pe.getValue()) {
                            doc.add(new Field("path", pe.getKey() + "/" + path, Store.YES, Index.NO));
                        }
                    }

                    w.addDocument(doc);
                }

                return null;
            }
        });
    }

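    /**
     * Converts raw hash occurrences into duplicate descriptions: spans fully
     * contained in an already accepted span are dropped, as are occurrences
     * that resolve to the current file itself.
     */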
    private static List<DuplicateDescription> translate(Map<String, long[]> hashes,
            Map<String, Map<RemoteIndex, Collection<String>>> occ, FileObject currentFile) {
        Map<String, Map<RemoteIndex, Collection<String>>> sorted = hashMap();
        Map<long[], DuplicateDescription> result = new LinkedHashMap<long[], DuplicateDescription>();

        sorted.putAll(occ);

        //iterate the longest duplicates first, so that spans contained in an
        //already accepted span are skipped rather than added:
        OUTER: for (Entry<String, Map<RemoteIndex, Collection<String>>> e : sorted.entrySet()) {
            long[] currentSpan = hashes.get(e.getKey());

            for (Iterator<Entry<long[], DuplicateDescription>> it = result.entrySet().iterator(); it.hasNext();) {
                Entry<long[], DuplicateDescription> span = it.next();

                if (span.getKey()[0] <= currentSpan[0] && span.getKey()[1] >= currentSpan[1]) {
                    continue OUTER;
                }

                if (currentSpan[0] <= span.getKey()[0] && currentSpan[1] >= span.getKey()[1]) {
                    it.remove();
                }
            }

            if (currentSpan[0] == (-1) || currentSpan[1] == (-1))
                continue;

            String longest = e.getKey();
            List<Span> foundDuplicates = new LinkedList<Span>();

            for (Entry<RemoteIndex, Collection<String>> root2Occurrences : e.getValue().entrySet()) {
                FileObject localRoot = URLMapper.findFileObject(root2Occurrences.getKey().getLocalFolder());

                if (localRoot == null)
                    continue; //the local folder for this index does not exist (anymore)

                for (String cand : root2Occurrences.getValue()) {
                    FileObject o = localRoot.getFileObject(cand);

                    if (o == null) {
                        //stale index entry: the file no longer exists locally
                        LOG.log(Level.FINE, "cannot resolve duplicate candidate: {0}", cand);
                        continue;
                    }
                    if (areEquivalent(currentFile, o))
                        continue;

                    foundDuplicates.add(new Span(o, -1, -1));
                }
            }

            if (foundDuplicates.isEmpty())
                continue;

            DuplicateDescription current = DuplicateDescription.of(foundDuplicates, getValue(longest), longest);

            result.put(currentSpan, current);
        }

        return new LinkedList<DuplicateDescription>(result.values());
    }

    private static boolean areEquivalent(FileObject f1, FileObject f2) {
        return f1.equals(f2);
    }

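    /** Extracts the duplicate length encoded after the last ':' of a hash key. */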
    private static long getValue(String encoded) {
        return Long.parseLong(encoded.substring(encoded.lastIndexOf(":") + 1));
    }

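    /** Creates a map over hash keys ordered by duplicate length, longest first. */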
    private static <T> TreeMap<String, T> hashMap() {
        return new TreeMap<String, T>(new Comparator<String>() {
            public int compare(String arg0, String arg1) {
                long len0 = getValue(arg0);
                long len1 = getValue(arg1);

                //longest first; fall back to the key text, so that distinct
                //hashes with the same length are not collapsed by the TreeMap:
                if (len0 != len1) {
                    return len0 > len1 ? -1 : 1;
                }

                return arg0.compareTo(arg1);
            }
        });
    }

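    /**
     * Runs one term query per hash against the cache; each hash with at
     * least one match is mapped to the stored "path" values of the matching
     * documents.
     */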
    private static Map<String, Collection<? extends String>> containsHash(IndexReader reader,
            Iterable<? extends String> hashes, AtomicBoolean cancel) throws IOException {
        Map<String, Collection<? extends String>> result = new LinkedHashMap<String, Collection<? extends String>>();
        //a single searcher over the cache serves all the hash queries:
        Searcher s = new IndexSearcher(reader);

        for (String hash : hashes) {
            if (cancel.get())
                return Collections.emptyMap();

            Collection<String> found = new LinkedList<String>();
            Query query = new TermQuery(new Term("hash", hash));
            BitSet matchingDocuments = new BitSet(reader.maxDoc());
            Collector c = new BitSetCollector(matchingDocuments);

            s.search(query, c);

            boolean wasFound = false;

            for (int docNum = matchingDocuments.nextSetBit(0); docNum >= 0; docNum = matchingDocuments
                    .nextSetBit(docNum + 1)) {
                if (cancel.get())
                    return Collections.emptyMap();

                final Document doc = reader.document(docNum);

                found.addAll(Arrays.asList(doc.getValues("path")));
                wasFound = true;
            }

            if (wasFound) {
                result.put(hash, found);
            }
        }

        return result;
    }

}