Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
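
Before the project examples below, here is a minimal sketch of the typical call pattern (an illustration, not taken from any of the listed projects; the field name "field" and the use of StandardAnalyzer with CharTermAttribute are assumptions). getAttribute returns the attribute instance already registered on the stream and throws IllegalArgumentException if it is not present, so it is common to guard the call with hasAttribute, or to register the attribute up front with addAttribute.

public static void printTerms(String text) throws IOException {
    // Sketch only: assumes org.apache.lucene.analysis.standard.StandardAnalyzer and
    // org.apache.lucene.analysis.tokenattributes.CharTermAttribute are on the classpath.
    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("field", new StringReader(text))) {
        ts.reset();
        if (ts.hasAttribute(CharTermAttribute.class)) {
            // the tokenizer already added this attribute, so getAttribute returns that instance
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
        }
        ts.end();
    }
}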

Usage

From source file:org.usergrid.utils.IndexUtils.java

License:Apache License

public static String keywordText(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    StringBuilder builder = new StringBuilder();
    boolean first = true;
    try {
        while (ts.incrementToken()) {
            if (!first) {
                builder.append(' ');
            }
            first = false;
            builder.append(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException e) {
        logger.error("Error getting keywords ", e);
    }
    return builder.toString();
}
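
Note that this example uses TermAttribute.term(), which belongs to the pre-4.0 Lucene API (TermAttribute was deprecated in 3.1 and removed in 4.0), and it consumes the stream without calling reset(). On Lucene 4+ the same loop would look roughly like this (a sketch; CharTermAttribute is the standard replacement):

    try {
        ts.reset(); // mandatory before incrementToken() in newer Lucene versions
        while (ts.incrementToken()) {
            if (!first) {
                builder.append(' ');
            }
            first = false;
            builder.append(ts.getAttribute(CharTermAttribute.class).toString());
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("Error getting keywords ", e);
    }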

From source file:perf.TestAnalyzerPerf.java

License:Apache License

private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    // 64 KB buffer
    InputStream is = new FileInputStream(wikiLinesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {

        boolean isWarmup = i < warmupCount;

        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();

        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();

    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d", desc,
            (sumTime / 1000000.0), hash, tokenCount));
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java

License:Apache License

@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());

            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //   docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));

            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java

License:Apache License

@Override
public void addFirstDocuments(String xmlFile) {
    WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(xmlFile);
    try {
        wxsp.setPageCallback(new PageCallbackHandler() {
            public void process(WikiPage page) {
                if (page.isDisambiguationPage() || page.isRedirect() || page.isSpecialPage())
                    return;
                List<WordKey> cipheredWords = new ArrayList<WordKey>();
                try {
                    TokenStream ts = analyzer.tokenStream(null,
                            new BufferedReader(new StringReader(page.getText())));
                    try {
                        ts.reset();
                        while (ts.incrementToken()) {
                            String word = ts.getAttribute(CharTermAttribute.class).toString();
                            if (word.length() > 0)
                                cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                        }
                        ts.end();
                    } finally {
                        ts.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                search.addFirstDocuments(new CDocument(new WordKey(crypto.digest(page.getTitle().getBytes())),
                        cipheredWords.toArray(new WordKey[cipheredWords.size()])));
                //store doc in the cloud
                //            cloud.putDoc(""+i, crypto.encryptDocAES(documents[i]));
            }
        });
        wxsp.parse();
        search.buildIndex();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKIHom.java

License:Apache License

@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<HomPosting> homScores = search.processQuery(cipheredWords);
        for (HomPosting posting : homScores) {
            finalScores.add(new Posting(posting.getDocId(), crypto.decryptHom(posting.getScore()).intValue()));
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License:Apache License

public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);

        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());

            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //   docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));

            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java

License:Apache License

public void addFirstDocuments(File[] docs) {
    try {
        //         File f = new File(path);
        //         File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:relevantfile.XmlParser.java

License:Open Source License

/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue"
    ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int"
    ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super",
    "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return",
            "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class",
            "decimal", "delegate", "event", "explicit", "false", "finally", "fixed", "foreach", "implicit",
            "in", "interface", "internal", "is", "lock", "namespace", "new", "null", "object", "operator",
            "out", "override", "params", "private", "protected", "public", "readonly", "ref", "sbyte", "sealed",
            "stackalloc", "string", "this", "throw", "true", "try", "typeof", "uint", "ulong", "unchecked",
            "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}

From source file:reviews.searching.SearchReviews.java

License:Apache License

public static void displayTokenStream(TokenStream tokenStream) throws IOException {

    TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) tokenStream.getAttribute(TypeAttribute.class);

    while (tokenStream.incrementToken()) {
        System.out.println(termAtt.term());
        System.out.println("Type: " + typeAtt.type());
        System.out.println();
    }
}

From source file:searching.QueryExpansion.java

/**
 *
 * store frequencies of top docs in maps
 * 
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, Double> map = new TreeMap();

        Integer length = 0;
        Double f;

        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    f = map.get(term);

                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;

        //logger.info(observed_bg_mass[iter] + "\t" + (1-observed_bg_prob));
        actual_pdocs++;
    }
}