List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value; the method returns the instance of that attribute contained in the stream's AttributeSource.
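Before the per-project examples below, here is a minimal, self-contained sketch of the usual getAttribute consume loop. It is not taken from any of the listed files; it assumes a Lucene release in which StandardAnalyzer has a no-argument constructor, and the field name "body" and the sample text are placeholders.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body",
                new StringReader("getAttribute returns the shared attribute instance"));
        try {
            // getAttribute throws IllegalArgumentException if the stream does not expose
            // this attribute; guard with ts.hasAttribute(...) when the chain is not under your control.
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                          // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString()); // the same instance is refilled for each token
            }
            ts.end();                            // records end-of-stream state (final offset)
        } finally {
            ts.close();
        }
        analyzer.close();
    }
}

Some of the older examples below use TermAttribute, which was deprecated and later removed in favor of CharTermAttribute; the loop structure is otherwise the same.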
From source file:org.usergrid.utils.IndexUtils.java
License:Apache License
public static String keywordText(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    StringBuilder builder = new StringBuilder();
    boolean first = true;
    try {
        while (ts.incrementToken()) {
            if (!first) {
                builder.append(' ');
            }
            first = false;
            builder.append(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException e) {
        logger.error("Error getting keywords ", e);
    }
    return builder.toString();
}
From source file:perf.TestAnalyzerPerf.java
License:Apache License
private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    // 64 KB buffer
    InputStream is = new FileInputStream(wikiLinesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {
        boolean isWarmup = i < warmupCount;
        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();
        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();
    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d", desc,
            (sumTime / 1000000.0), hash, tokenCount));
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java
License:Apache License
@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());
            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //        docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));
            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java
License:Apache License
@Override
public void addFirstDocuments(String xmlFile) {
    WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(xmlFile);
    try {
        wxsp.setPageCallback(new PageCallbackHandler() {
            public void process(WikiPage page) {
                if (page.isDisambiguationPage() || page.isRedirect() || page.isSpecialPage())
                    return;
                List<WordKey> cipheredWords = new ArrayList<WordKey>();
                try {
                    TokenStream ts = analyzer.tokenStream(null,
                            new BufferedReader(new StringReader(page.getText())));
                    try {
                        ts.reset();
                        while (ts.incrementToken()) {
                            String word = ts.getAttribute(CharTermAttribute.class).toString();
                            if (word.length() > 0)
                                cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                        }
                        ts.end();
                    } finally {
                        ts.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                search.addFirstDocuments(new CDocument(new WordKey(crypto.digest(page.getTitle().getBytes())),
                        cipheredWords.toArray(new WordKey[cipheredWords.size()])));
                //store doc in the cloud
                // cloud.putDoc("" + i, crypto.encryptDocAES(documents[i]));
            }
        });
        wxsp.parse();
        search.buildIndex();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKIHom.java
License:Apache License
@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<HomPosting> homScores = search.processQuery(cipheredWords);
        for (HomPosting posting : homScores) {
            finalScores.add(new Posting(posting.getDocId(), crypto.decryptHom(posting.getScore()).intValue()));
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java
License:Apache License
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());
            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //        docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));
            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java
License:Apache License
public void addFirstDocuments(File[] docs) {
    try {
        // File f = new File(path);
        // File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:relevantfile.XmlParser.java
License:Open Source License
/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue"
            ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int"
            ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super",
            "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return", "short",
            "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile",
            "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class", "decimal", "delegate",
            "event", "explicit", "false", "finally", "fixed", "foreach", "implicit", "in", "interface", "internal",
            "is", "lock", "namespace", "new", "null", "object", "operator", "out", "override", "params", "private",
            "protected", "public", "readonly", "ref", "sbyte", "sealed", "stackalloc", "string", "this", "throw",
            "true", "try", "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);

    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);

    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}
From source file:reviews.searching.SearchReviews.java
License:Apache License
public static void displayTokenStream(TokenStream tokenStream) throws IOException {
    TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) tokenStream.getAttribute(TypeAttribute.class);
    while (tokenStream.incrementToken()) {
        System.out.println(termAtt.term());
        System.out.println("Type: " + typeAtt.type());
        System.out.println();
    }
}
From source file:searching.QueryExpansion.java
/**
 * Store frequencies of top docs in maps.
 *
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {
    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, Double> map = new TreeMap();
        Integer length = 0;
        Double f;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }
        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;
        //logger.info(observed_bg_mass[iter] + "\t" + (1-observed_bg_prob));
        actual_pdocs++;
    }
}