Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
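incrementToken() advances the stream to the next token, updating the stream's attributes in place, and returns false once the stream is exhausted. Callers obtain the attributes they need (typically CharTermAttribute), call reset() before the first incrementToken(), and call end() and close() when done. A minimal sketch of that contract, assuming a Lucene 5.x-or-later StandardAnalyzer; the field name "content" and the sample text are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("content", "Hello Lucene token streams")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                       // mandatory before the first incrementToken()
            while (ts.incrementToken()) {     // false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                         // records end-of-stream state (final offset)
        }                                     // try-with-resources calls close()
        analyzer.close();
    }
}

The examples below show the same loop in real code bases, against both the current CharTermAttribute API and the older TermAttribute API.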
From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java
License:Apache License
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}
From source file:edu.utsa.sifter.DocMaker.java
License:Apache License
public static boolean addBodyField(final Document doc, final String body, final Analyzer analyzer,
        boolean testEmpty) throws IOException {
    final Field f = new Field("body", body, BodyOptions);
    if (testEmpty) {
        // System.out.println("testing if doc has empty body");
        final TokenStream toks = f.tokenStream(analyzer);
        toks.reset();
        if (!toks.incrementToken()) {
            // System.out.println("empty body, won't index");
            toks.close();
            return false;
        }
    }
    doc.add(new Field("body", body, BodyOptions));
    doc.add(new LongField("body-len", body.length(), Field.Store.YES));
    return true;
}
From source file:edu.virginia.cs.utility.StringTokenizer.java
/**
 * Generates a list of tokens from the parameter string.
 *
 * @param string the string to tokenize
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:elhuyar.bilakit.PayloadQParserPlugin.java
License:Open Source License
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        // analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = "";
        analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;
        // Note that this will work for any field defined with the
        // <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);
        return plter;
    }
    return super.getFieldQuery(field, queryText, quoted);
}
From source file:engine.easy.analyzer.EasySearchAnalyzer.java
License:Apache License
private static void printResult(String text, Analyzer analyzer) throws IOException {
    int tokenCount = 0;
    TokenStream tokenStream = analyzer.tokenStream("FIELDNAME", new StringReader(text));
    // this method will be used for token streams
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        tokenCount++;
        String tokenText = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        System.out.println(" >> Token " + tokenCount + ": " + tokenText);
    }
}
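The example above reads term text through TermAttribute.termBuffer()/termLength(), an API from the Lucene 2.9/3.x line that was later replaced by CharTermAttribute and removed in 4.0. A hedged sketch of the same loop against the newer attribute API; the method name printTokens is illustrative and the TokenStream argument is assumed to be freshly obtained from an analyzer:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenPrinter {
    // Prints every token of a freshly obtained TokenStream using the post-3.x attribute API.
    static void printTokens(TokenStream tokenStream) throws IOException {
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        int tokenCount = 0;
        while (tokenStream.incrementToken()) {
            tokenCount++;
            // toString() copies the term buffer; buffer()/length() give direct access
            System.out.println(" >> Token " + tokenCount + ": " + termAtt.toString());
        }
        tokenStream.end();
        tokenStream.close();
    }
}

The same substitution applies to the TermAttribute-based examples further down this page.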
From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java
License:Apache License
/**
 * Counts the tokens in the token stream.
 *
 * @return the total number of tokens and the number of distinct tokens.
 * @throws IOException if an I/O error occurs while reading the stream.
 */
private static int[] countTokenStream(TokenStream tokenStream) throws IOException {
    int v[] = new int[2];
    HashSet countTokenStreamBuffer = new HashSet();
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        v[0]++;
        countTokenStreamBuffer.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
    }
    v[1] = countTokenStreamBuffer.size();
    tokenStream.reset();
    countTokenStreamBuffer.clear();
    return v;
}
From source file:fr.ericlab.sondy.core.DataManipulation.java
License:Open Source License
public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();
        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        // statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");
        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;
        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";
        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));
        Analyzer analyzer;
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class.forName(
                        "org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }
        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);
        int docId = 0;
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = new String();
            String mentionContent = new String();
            String timestamps = new String();
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_"
                            + ngram + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES" + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();
            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);
            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException
            | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}
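In the ngram > 1 branch above, each message is re-tokenized through a ShingleAnalyzerWrapper and only shingles that do not contain the "_" filler marker are kept. An isolated sketch of that shingle loop, using the same Lucene 3.6 constructors as the example; the field name "text" and the sample sentence are placeholders:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ShingleSketch {
    public static void main(String[] args) throws IOException {
        // min and max shingle size of 2, space-separated, no unigrams in the output
        ShingleAnalyzerWrapper shingles = new ShingleAnalyzerWrapper(
                new WhitespaceAnalyzer(Version.LUCENE_36), 2, 2, " ", false, false);
        TokenStream ts = shingles.tokenStream("text", new StringReader("lucene token stream example"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String shingle = termAtt.toString();
            if (!shingle.contains("_")) {   // skip shingles padded with the "_" filler token
                System.out.println(shingle);
            }
        }
        ts.end();
        ts.close();
    }
}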
From source file:fr.inrialpes.exmo.ontosim.string.CommonWords.java
License:Open Source License
private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            s.add(termAtt.term());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    /*
    Token token;
    try {
        while ((token = ts.next()) != null) {
            s.add(token.termText());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    */
    map.put(e, s);
}
From source file:fr.inrialpes.exmo.ontosim.string.JWNLDistances.java
License:Open Source License
/**
 * Takes a gloss-like string (text) and returns it tokenized, with:
 * - stop-word removal
 * - lower-casing
 * - Porter stemming
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true, new LowerCaseTokenizer(new StringReader(s)), stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}
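The filter chain above (letter tokenization with lower-casing, stop-word removal, Porter stemming) is assembled by hand on every call, which is what the source comment is apologising for. One common alternative is to package the chain in a small Analyzer subclass and let tokenStream() rebuild it. A hedged sketch under that assumption; the class name GlossAnalyzer is hypothetical and the package locations assume Lucene 5.x/6.x (they moved again in later releases):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.util.CharArraySet;

// Hypothetical analyzer packaging the same chain: letters-only lower-cased tokens,
// stop-word removal, then Porter stemming.
class GlossAnalyzer extends Analyzer {
    private final CharArraySet stopWords;

    GlossAnalyzer(CharArraySet stopWords) {
        this.stopWords = stopWords;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new LowerCaseTokenizer();        // splits on non-letters, lower-cases
        TokenStream chain = new StopFilter(source, stopWords);
        chain = new PorterStemFilter(chain);
        return new TokenStreamComponents(source, chain);
    }
}

tokenizeGloss() would then reduce to the standard reset()/incrementToken()/end()/close() loop over glossAnalyzer.tokenStream("", s).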
From source file:fr.inrialpes.exmo.ontosim.VectorSpaceMeasure.java
License:Open Source License
/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 *
 * @param toAnalyse the string to be analysed
 * @param words the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        while (tokenS.incrementToken()) {
            words.add(termAtt.term());
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}