org.netbeans.modules.jackpot30.impl.duplicates.ComputeDuplicates.java Source code

Introduction

Here is the source code for org.netbeans.modules.jackpot30.impl.duplicates.ComputeDuplicates.java
Source

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 2009-2010 Sun Microsystems, Inc. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common
 * Development and Distribution License("CDDL") (collectively, the
 * "License"). You may not use this file except in compliance with the
 * License. You can obtain a copy of the License at
 * http://www.netbeans.org/cddl-gplv2.html
 * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
 * specific language governing permissions and limitations under the
 * License.  When distributing the software, include this License Header
 * Notice in each file and include the License file at
 * nbbuild/licenses/CDDL-GPL-2-CP.  Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the GPL Version 2 section of the License file that
 * accompanied this code. If applicable, add the following below the
 * License Header, with the fields enclosed by brackets [] replaced by
 * your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * If you wish your version of this file to be governed by only the CDDL
 * or only the GPL Version 2, indicate your decision by adding
 * "[Contributor] elects to include this software in this distribution
 * under the [CDDL or GPL Version 2] license." If you do not indicate a
 * single choice of license, a recipient has the option to distribute
 * your version of this file under either the CDDL, the GPL Version 2 or
 * to extend the choice of license to its licensees as provided above.
 * However, if you add GPL Version 2 code and therefore, elected the GPL
 * Version 2 license, then the option applies only if the new code is
 * made subject to such option by the copyright holder.
 *
 * Contributor(s):
 *
 * Portions Copyrighted 2009-2010 Sun Microsystems, Inc.
 */
package org.netbeans.modules.jackpot30.impl.duplicates;

import com.sun.source.tree.CompilationUnitTree;
import com.sun.source.tree.IdentifierTree;
import com.sun.source.tree.NewClassTree;
import com.sun.source.tree.Tree;
import com.sun.source.tree.VariableTree;
import com.sun.source.util.SourcePositions;
import com.sun.source.util.TreePath;
import com.sun.source.util.TreePathScanner;
import com.sun.source.util.Trees;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.security.DigestOutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.lang.model.element.Element;
import javax.lang.model.element.Modifier;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.netbeans.api.annotations.common.CheckForNull;
import org.netbeans.api.annotations.common.NonNull;
import org.netbeans.api.java.classpath.ClassPath;
import org.netbeans.api.java.classpath.GlobalPathRegistry;
import org.netbeans.api.java.source.CompilationInfo;
import org.netbeans.api.progress.ProgressHandle;
import org.netbeans.modules.jackpot30.common.api.LuceneHelpers.BitSetCollector;
import org.netbeans.modules.jackpot30.impl.duplicates.indexing.DuplicatesCustomIndexerImpl;
import org.netbeans.modules.jackpot30.impl.duplicates.indexing.DuplicatesIndex;
import org.netbeans.modules.parsing.impl.indexing.CacheFolder;
import org.openide.filesystems.FileObject;
import org.openide.filesystems.FileUtil;
import org.openide.filesystems.URLMapper;
import org.openide.util.Exceptions;

/**
 *
 * @author lahvac
 */
public class ComputeDuplicates {

    /**
     * Computes duplicates across every source root registered with the
     * {@link GlobalPathRegistry} under {@link ClassPath#SOURCE}.
     *
     * <p>The elapsed wall-clock time in milliseconds is printed to {@code System.err}
     * when the delegated computation returns or throws.
     *
     * @param progress progress handle handed over to {@code computeDuplicates}
     * @param cancel cancellation flag handed over to {@code computeDuplicates}
     * @return the iterator produced by {@code computeDuplicates} for the collected roots
     * @throws IOException if {@code computeDuplicates} throws it
     */
    public Iterator<? extends DuplicateDescription> computeDuplicatesForAllOpenedProjects(ProgressHandle progress,
            AtomicBoolean cancel) throws IOException {
        final Set<URL> sourceRoots = new HashSet<URL>();

        for (ClassPath sourcePath : GlobalPathRegistry.getDefault().getPaths(ClassPath.SOURCE)) {
            for (ClassPath.Entry entry : sourcePath.entries()) {
                sourceRoots.add(entry.getURL());
            }
        }

        final long startedAt = System.currentTimeMillis();

        try {
            return computeDuplicates(sourceRoots, progress, cancel);
        } finally {
            final long elapsed = System.currentTimeMillis() - startedAt;

            System.err.println("duplicates for all open projects: " + elapsed);
        }
    }

    /**
     * Computes duplicates for the given source roots.
     *
     * <p>For each root the duplicates index is first updated through
     * {@code DuplicatesCustomIndexerImpl.FactoryImpl}, then an {@link IndexReader} is opened
     * on the root's index directory if that directory contains any files. All opened readers
     * are searched together for values of the {@code "duplicatesGeneralized"} field that occur
     * in at least two documents; the resulting hashes are sorted and handed to a lazy iterator.
     *
     * <p>Roots are skipped (no reader is opened) when no cache folder exists for them or when
     * the root URL cannot be mapped to a {@link FileObject}. An {@link IOException} raised while
     * processing a single root is reported via {@link Exceptions#printStackTrace} and does not
     * abort processing of the remaining roots.
     *
     * @param forURLs source roots to inspect
     * @param progress handle used to report the two phases of the computation
     * @param cancel cancellation flag passed to the index update and to the term enumeration
     * @return iterator lazily producing duplicates that have at least two occurrences
     * @throws IOException if enumerating the duplicated values fails
     */
    public Iterator<? extends DuplicateDescription> computeDuplicates(Set<URL> forURLs, ProgressHandle progress,
            AtomicBoolean cancel) throws IOException {
        Map<IndexReader, FileObject> readers2Roots = new LinkedHashMap<IndexReader, FileObject>();

        progress.progress("Updating indices");

        for (URL u : forURLs) {
            try {
                //TODO: needs to be removed for server mode
                new DuplicatesCustomIndexerImpl.FactoryImpl().updateIndex(u, cancel); //TODO: show updating progress to the user

                File cacheRoot = cacheRoot(u);

                if (cacheRoot == null) {
                    // No cache folder for this root. new File((File) null, name) would silently
                    // resolve against the current working directory, so skip the root instead.
                    continue;
                }

                FileObject root = URLMapper.findFileObject(u);

                if (root == null) {
                    // Without a root folder the indexed relative paths cannot be resolved later.
                    continue;
                }

                File dir = new File(cacheRoot, DuplicatesIndex.NAME);
                File[] indexFiles = dir.listFiles();

                if (indexFiles != null && indexFiles.length > 0) {
                    IndexReader reader = IndexReader.open(FSDirectory.open(dir), true);

                    readers2Roots.put(reader, root);
                }
            } catch (IOException ex) {
                Exceptions.printStackTrace(ex);
            }
        }

        progress.progress("Searching for duplicates");

        MultiReader r = new MultiReader(readers2Roots.keySet().toArray(new IndexReader[0]));

        List<String> dd = new ArrayList<String>(getDuplicatedValues(r, "duplicatesGeneralized", cancel));

        sortHashes(dd);

        //TODO: only show valuable duplicates?:
        //        dd = dd.subList(0, dd.size() / 10 + 1);

        return new DuplicatesIterator(readers2Roots, dd, 2);
    }

    /**
     * Creates an iterator over the duplicates identified by the given hashes.
     *
     * <p>The hashes are copied into a new list and sorted with {@code sortHashes} before
     * iteration; a hash is reported as soon as a single occurrence is found.
     *
     * @param readers2Roots index readers mapped to the root folder their documents belong to
     * @param hashes encoded hashes to look up
     * @return lazy iterator over the matching duplicate descriptions
     */
    public static Iterator<? extends DuplicateDescription> XXXduplicatesOf(
            Map<IndexReader, FileObject> readers2Roots, Collection<String> hashes) {
        final List<String> ordered = new ArrayList<String>(hashes);

        sortHashes(ordered);

        final int minimumOccurrences = 1;

        return new DuplicatesIterator(readers2Roots, ordered, minimumOccurrences);
    }

    /**
     * Locates the duplicates cache folder of a source root.
     *
     * @param sourceRoot the source root whose cache is requested
     * @return the {@code DuplicatesIndex.NAME/DuplicatesIndex.VERSION} folder below the root's
     *         data folder as a {@link File}, or {@code null} if that folder does not exist
     * @throws IOException if the data folder cannot be obtained
     */
    private static File cacheRoot(URL sourceRoot) throws IOException {
        final String relativePath = DuplicatesIndex.NAME + "/" + DuplicatesIndex.VERSION;
        final FileObject versionedFolder = CacheFolder.getDataFolder(sourceRoot).getFileObject(relativePath);

        if (versionedFolder == null) {
            return null;
        }

        return FileUtil.toFile(versionedFolder);
    }

    /**
     * Lazy iterator that turns candidate hashes into {@link DuplicateDescription}s by
     * querying every index reader for documents containing the hash.
     *
     * <p>Descriptions already returned are remembered; a new candidate that is subsumed by
     * one of them (see {@code subsumes}) is not reported.
     */
    private static final class DuplicatesIterator implements Iterator<DuplicateDescription> {
        /** Index readers mapped to the root folder against which indexed relative paths are resolved. */
        private final Map<IndexReader, FileObject> readers2Roots;
        /** Remaining candidate hashes, consumed in iteration order. */
        private final Iterator<String> duplicateCandidates;
        /** Minimal number of spans a candidate needs to be reported. */
        private final int minDuplicates;
        /** Descriptions reported so far, used for the subsumption check. */
        private final List<DuplicateDescription> result = new LinkedList<DuplicateDescription>();

        public DuplicatesIterator(Map<IndexReader, FileObject> readers2Roots, Iterable<String> duplicateCandidates,
                int minDuplicates) {
            this.readers2Roots = readers2Roots;
            this.duplicateCandidates = duplicateCandidates.iterator();
            this.minDuplicates = minDuplicates;
        }

        /**
         * Advances through the candidates until one yields a reportable description.
         *
         * @return the next description, or {@code null} when the candidates are exhausted
         * @throws IOException if searching an index or loading a document fails
         */
        private DuplicateDescription nextDescription() throws IOException {
            while (duplicateCandidates.hasNext()) {
                String longest = duplicateCandidates.next();
                List<Span> foundDuplicates = new LinkedList<Span>();

                Query query = new TermQuery(new Term("duplicatesGeneralized", longest));

                for (Entry<IndexReader, FileObject> e : readers2Roots.entrySet()) {
                    Searcher s = new IndexSearcher(e.getKey());
                    BitSet matchingDocuments = new BitSet(e.getKey().maxDoc());
                    Collector c = new BitSetCollector(matchingDocuments);

                    s.search(query, c);

                    for (int docNum = matchingDocuments.nextSetBit(0); docNum >= 0; docNum = matchingDocuments
                            .nextSetBit(docNum + 1)) {
                        final Document doc = e.getKey().document(docNum);
                        // NOTE(review): binarySearch requires the stored hash values to be sorted;
                        // presumably the indexer writes them in sorted order - confirm.
                        int pos = Arrays.binarySearch(doc.getValues("duplicatesGeneralized"), longest);

                        if (pos < 0) {
                            continue;
                        }

                        // the positions field is read at the same index as the hash
                        String spanSpec = doc.getValues("duplicatesPositions")[pos];
                        String relPath = doc.getField("duplicatesPath").stringValue();
                        FileObject file = e.getValue().getFileObject(relPath);

                        if (file == null) {
                            // The indexed file can no longer be found below the root;
                            // do not report spans that point to no file.
                            continue;
                        }

                        for (String spanPart : spanSpec.split(";")) {
                            Span span = Span.of(file, spanPart);

                            if (span != null) {
                                foundDuplicates.add(span);
                            }
                        }
                    }
                }

                if (foundDuplicates.size() >= minDuplicates) {
                    DuplicateDescription current = DuplicateDescription.of(foundDuplicates, getValue(longest),
                            longest);
                    boolean add = true;

                    for (Iterator<DuplicateDescription> it = result.iterator(); it.hasNext();) {
                        DuplicateDescription existing = it.next();

                        if (subsumes(existing, current)) {
                            add = false;
                            break;
                        }

                        if (subsumes(current, existing)) {
                            //can happen? (note that the duplicates are sorted by value)
                            it.remove();
                        }
                    }

                    if (add) {
                        result.add(current);
                        return current;
                    }
                }

            }
            return null;
        }

        /** Description computed by {@link #hasNext()} and not yet handed out by {@link #next()}. */
        private DuplicateDescription next;

        /**
         * Computes the next description if none is pending. An {@link IOException} raised
         * while doing so is reported via {@link Exceptions#printStackTrace}; if nothing was
         * computed, {@code false} is returned.
         */
        public boolean hasNext() {
            if (next == null) {
                try {
                    next = nextDescription();
                } catch (IOException ex) {
                    Exceptions.printStackTrace(ex);
                }
            }

            return next != null;
        }

        public DuplicateDescription next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }

            DuplicateDescription r = next;

            next = null;
            return r;
        }

        /** Removal is not supported. */
        public void remove() {
            throw new UnsupportedOperationException("Not supported.");
        }

    }

    /**
     * Collects all terms of the given field that occur in at least two documents.
     *
     * @param ir reader whose terms are enumerated
     * @param field name of the field to inspect
     * @param cancel when set, the enumeration stops and an empty list is returned
     * @return texts of the terms of {@code field} with a document frequency of two or more,
     *         in enumeration order; empty if the index holds no such terms or on cancel
     * @throws IOException if reading the index fails
     */
    private static List<String> getDuplicatedValues(IndexReader ir, String field, AtomicBoolean cancel)
            throws IOException {
        List<String> values = new ArrayList<String>();
        TermEnum terms = ir.terms(new Term(field));

        try {
            // the enumeration is already positioned on its first term, hence do-while
            do {
                if (cancel.get())
                    return Collections.emptyList();

                final Term term = terms.term();

                // term is null when the enumeration has no (more) terms, e.g. for an empty index
                if (term == null || !field.equals(term.field())) {
                    break;
                }

                if (terms.docFreq() < 2)
                    continue;

                values.add(term.text());
            } while (terms.next());
        } finally {
            terms.close();
        }

        return values;
    }

    /**
     * Extracts the numeric value from an encoded hash.
     *
     * @param encoded encoded hash; everything after its last {@code ':'} (or the whole
     *        string if there is none) is parsed as a decimal {@code long}
     * @return the parsed value
     * @throws NumberFormatException if that part is not a valid {@code long}
     */
    private static long getValue(String encoded) {
        final int separator = encoded.lastIndexOf(':');
        final String valuePart = encoded.substring(separator + 1);

        return Long.parseLong(valuePart);
    }

    /**
     * Sorts encoded hashes in place by descending value, where the value of a hash is
     * what {@code getValue} extracts from it.
     *
     * @param hashes list of encoded hashes to sort
     */
    private static void sortHashes(List<String> hashes) {
        Collections.sort(hashes, new Comparator<String>() {
            public int compare(String arg0, String arg1) {
                long first = getValue(arg0);
                long second = getValue(arg1);

                // Compare directly instead of subtracting: a difference of two longs can
                // overflow and yield the wrong sign.
                if (second < first) {
                    return -1;
                }

                return second > first ? 1 : 0;
            }
        });
    }

    /**
     * Tells whether one duplicate description covers another one.
     *
     * <p>Both descriptions must refer to exactly the same set of files. If they do, the
     * first span of {@code bigger} is compared with every span of {@code smaller} located in
     * the same file (by reference); the result is {@code true} as soon as one of those spans
     * lies within the first span of {@code bigger} without being identical to it.
     *
     * @param bigger the description expected to cover the other one
     * @param smaller the description expected to be covered
     * @return {@code true} if {@code bigger} subsumes {@code smaller} as described above
     */
    private static boolean subsumes(DuplicateDescription bigger, DuplicateDescription smaller) {
        final Set<FileObject> biggerFiles = new HashSet<FileObject>();
        final Set<FileObject> smallerFiles = new HashSet<FileObject>();

        for (Span span : bigger.dupes) {
            biggerFiles.add(span.file);
        }

        for (Span span : smaller.dupes) {
            smallerFiles.add(span.file);
        }

        if (!biggerFiles.equals(smallerFiles)) {
            return false;
        }

        final Span reference = bigger.dupes.get(0);

        for (Span candidate : smaller.dupes) {
            if (candidate.file != reference.file) {
                continue;
            }

            final boolean within = reference.startOff <= candidate.startOff
                    && reference.endOff >= candidate.endOff;
            final boolean identical = reference.startOff == candidate.startOff
                    && reference.endOff == candidate.endOff;

            // contained, but strictly smaller on at least one side
            if (within && !identical) {
                return true;
            }
        }

        return false;
    }

    /**
     * Convenience overload that takes the trees and the compilation unit from the given
     * {@link CompilationInfo}.
     *
     * @param info source of the {@link Trees} instance and of the compilation unit
     * @return the result of {@code encodeGeneralized(Trees, CompilationUnitTree)}
     */
    public static Map<String, long[]> encodeGeneralized(CompilationInfo info) {
        final Trees trees = info.getTrees();
        final CompilationUnitTree unit = info.getCompilationUnit();

        return encodeGeneralized(trees, unit);
    }

    /**
     * Computes generalized hashes for the subtrees of a compilation unit.
     *
     * <p>Every tree visited below the root is printed through {@code GeneralizePattern} into
     * an MD5 digest. If the pattern's value reaches {@code MINIMAL_VALUE}, the subtree is
     * recorded under the key {@code HHHHHHHH:value}, where {@code HHHHHHHH} are the first
     * four digest bytes as upper-case hex and {@code value} is the pattern's value. The tree
     * passed in first (for which no current path exists yet) is not hashed itself.
     *
     * @param trees javac trees utility used for source positions and element lookup
     * @param cut the compilation unit to scan
     * @return map sorted by key; each value is a flat array of
     *         {@code start, end, start, end, ...} source positions of the subtrees sharing
     *         that key, as reported by {@link SourcePositions}
     */
    public static Map<String, long[]> encodeGeneralized(final Trees trees, final CompilationUnitTree cut) {
        final SourcePositions sp = trees.getSourcePositions();
        final Map<String, Collection<Long>> positions = new HashMap<String, Collection<Long>>();

        new TreePathScanner<Void, Void>() {
            @Override
            public Void scan(Tree tree, Void p) {
                if (tree == null)
                    return null;
                if (getCurrentPath() != null) {
                    DigestOutputStream baos = null;
                    PrintWriter out = null;
                    try {
                        baos = new DigestOutputStream(new ByteArrayOutputStream(),
                                MessageDigest.getInstance("MD5"));
                        out = new PrintWriter(new OutputStreamWriter(baos, "UTF-8"));
                        GeneralizePattern gen = new GeneralizePattern(out, trees);
                        gen.scan(new TreePath(getCurrentPath(), tree), null);
                        // flush everything into the digest before reading it
                        out.close();
                        if (gen.value >= MINIMAL_VALUE) {
                            StringBuilder text = new StringBuilder();
                            byte[] bytes = baos.getMessageDigest().digest();
                            for (int cntr = 0; cntr < 4; cntr++) {
                                text.append(String.format("%02X", bytes[cntr]));
                            }
                            text.append(':').append(gen.value);
                            String enc = text.toString();
                            Collection<Long> spanSpecs = positions.get(enc);
                            if (spanSpecs == null) {
                                positions.put(enc, spanSpecs = new LinkedList<Long>());
                            }
                            long start = sp.getStartPosition(cut, tree);
                            spanSpecs.add(start);
                            spanSpecs.add(sp.getEndPosition(cut, tree));
                        }
                    } catch (UnsupportedEncodingException ex) {
                        Exceptions.printStackTrace(ex);
                    } catch (NoSuchAlgorithmException ex) {
                        Exceptions.printStackTrace(ex);
                    } finally {
                        // Either stream is still null if its construction failed above;
                        // dereferencing it here would replace the reported exception with an NPE.
                        if (baos != null) {
                            try {
                                baos.close();
                            } catch (IOException ex) {
                                Exceptions.printStackTrace(ex);
                            }
                        }
                        if (out != null) {
                            out.close();
                        }
                    }
                }
                return super.scan(tree, p);
            }
        }.scan(cut, null);

        Map<String, long[]> result = new TreeMap<String, long[]>();

        for (Entry<String, Collection<Long>> e : positions.entrySet()) {
            long[] spans = new long[e.getValue().size()];
            int idx = 0;

            for (Long l : e.getValue()) {
                spans[idx++] = l;
            }

            result.put(e.getKey(), spans);
        }

        return result;
    }

    /**
     * Scanner that writes a generalized textual form of a tree to a writer.
     *
     * <p>For every non-null tree scanned, the name of its kind is written and {@code value}
     * is incremented. Identifiers and variables that resolve to a private element, a local
     * variable, an exception parameter or a parameter are written as placeholders
     * {@code $0}, {@code $1}, ... (one placeholder per element, numbered in order of first
     * use); other identifiers and variables are written by name. {@code new} expressions are
     * not descended into.
     */
    private static final class GeneralizePattern extends TreePathScanner<Void, Void> {

        public final Map<Tree, Tree> tree2Variable = new HashMap<Tree, Tree>();
        /** Placeholder already assigned to each generalized element. */
        private final Map<Element, String> element2Variable = new HashMap<Element, String>();
        /** Sink for the generalized text. */
        private final PrintWriter to;
        private final Trees javacTrees;
        /** Number of trees scanned, minus the identifiers replaced by a placeholder. */
        private long value;

        /** Index used for the next new placeholder. */
        private int currentVariableIndex = 0;

        public GeneralizePattern(PrintWriter to, Trees javacTrees) {
            this.to = to;
            this.javacTrees = javacTrees;
        }

        /** Returns the placeholder of the element, allocating a new one on first use. */
        private @NonNull String getVariable(@NonNull Element el) {
            final String known = element2Variable.get(el);

            if (known != null) {
                return known;
            }

            final String fresh = "$" + currentVariableIndex++;

            element2Variable.put(el, fresh);
            return fresh;
        }

        /**
         * Tells whether references to the element are replaced by a placeholder: true for
         * local variables, exception parameters, parameters and anything private.
         */
        private boolean shouldBeGeneralized(@NonNull Element el) {
            switch (el.getKind()) {
            case LOCAL_VARIABLE:
            case EXCEPTION_PARAMETER:
            case PARAMETER:
                return true;
            default:
                return el.getModifiers().contains(Modifier.PRIVATE);
            }
        }

        @Override
        public Void scan(Tree tree, Void p) {
            if (tree != null) {
                value++;
                to.append(tree.getKind().name());
            }

            return super.scan(tree, p);
        }

        @Override
        public Void visitIdentifier(IdentifierTree node, Void p) {
            final Element resolved = javacTrees.getElement(getCurrentPath());
            final boolean generalize = resolved != null && shouldBeGeneralized(resolved);

            if (!generalize) {
                to.append(node.getName());
                return super.visitIdentifier(node, p);
            }

            to.append(getVariable(resolved));
            // undo the increment made by scan for this identifier
            value--;
            return null;
        }

        @Override
        public Void visitVariable(VariableTree node, Void p) {
            final Element resolved = javacTrees.getElement(getCurrentPath());
            final boolean generalize = resolved != null && shouldBeGeneralized(resolved);

            if (generalize) {
                to.append(getVariable(resolved));
            } else {
                to.append(node.getName());
            }

            return super.visitVariable(node, p);
        }

        /** Instance creation expressions contribute nothing beyond their kind name. */
        @Override
        public Void visitNewClass(NewClassTree node, Void p) {
            return null;
        }

    }

    // Smallest GeneralizePattern value a scanned subtree must reach for encodeGeneralized
    // to record its hash and source positions.
    private static final int MINIMAL_VALUE = 10;

    /**
     * Describes one group of duplicated code: the spans involved, the value of the
     * duplicate and the encoded hash it was found under.
     */
    public static final class DuplicateDescription {

        /** Occurrences of the duplicated code. */
        public final List<Span> dupes;
        /** Numeric value associated with the duplicate. */
        public final long value;
        /** Encoded hash identifying the duplicate. */
        public final String hash;

        /**
         * Creates a description from the given parts; the list is stored as passed.
         *
         * @param dupes occurrences of the duplicated code
         * @param value numeric value associated with the duplicate
         * @param hash encoded hash identifying the duplicate
         * @return a new description
         */
        public static DuplicateDescription of(List<Span> dupes, long value, String hash) {
            return new DuplicateDescription(dupes, value, hash);
        }

        private DuplicateDescription(List<Span> dupes, long value, String hash) {
            this.hash = hash;
            this.value = value;
            this.dupes = dupes;
        }
    }

    /**
     * A range of offsets inside a file.
     */
    public static final class Span {
        /** File the range belongs to. */
        public final FileObject file;
        /** Start offset of the range. */
        public final int startOff;
        /** End offset of the range. */
        public final int endOff;

        public Span(FileObject file, int startOff, int endOff) {
            this.file = file;
            this.startOff = startOff;
            this.endOff = endOff;
        }

        /**
         * Parses a span specification of the form {@code start:length}.
         *
         * @param file file the span belongs to
         * @param spanSpec two decimal integers separated by {@code ':'}; the end offset is
         *        computed as the sum of both
         * @return the span, or {@code null} if the start or the computed end is negative
         * @throws NumberFormatException if either part is not a valid integer
         * @throws ArrayIndexOutOfBoundsException if the specification has fewer than two parts
         */
        public static @CheckForNull Span of(FileObject file, String spanSpec) {
            final String[] fields = spanSpec.split(":");
            final int start = Integer.parseInt(fields[0]);
            final int end = start + Integer.parseInt(fields[1]);

            if (start >= 0 && end >= 0) {
                return new Span(file, start, end);
            }

            return null;
        }

    }
}