Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
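
The usage examples below all follow the same basic consumption pattern: obtain a TokenStream from an Analyzer, add or get the attributes you need (for example CharTermAttribute), call reset(), loop while incrementToken() returns true, then call end() and close(). The following minimal sketch illustrates that pattern; it is not taken from any of the source files below, and it assumes a Lucene version in which StandardAnalyzer has a no-argument constructor and Analyzer/TokenStream are closeable. The field name "body" and the sample text are illustrative only.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {

    // Collects the terms the analyzer produces for the given text.
    static List<String> tokenize(Analyzer analyzer, String field, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                    // must be called before the first incrementToken()
            while (stream.incrementToken()) {  // advances to the next token; returns false at end of stream
                terms.add(termAtt.toString());
            }
            stream.end();                      // records end-of-stream state such as the final offset
        }                                      // try-with-resources closes the stream
        return terms;
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            System.out.println(tokenize(analyzer, "body", "Consumers advance the stream to the next token."));
        }
    }
}

Note that several of the older examples on this page use the deprecated TermAttribute with term()/termBuffer() from Lucene 2.9/3.x instead of CharTermAttribute; the loop over incrementToken() is the same in both cases.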

Usage

From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java

License:Apache License

@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}

From source file:edu.utsa.sifter.DocMaker.java

License:Apache License

public static boolean addBodyField(final Document doc, final String body, final Analyzer analyzer,
        boolean testEmpty) throws IOException {
    final Field f = new Field("body", body, BodyOptions);
    if (testEmpty) {
        // System.out.println("testing if doc has empty body");
        final TokenStream toks = f.tokenStream(analyzer);
        try {
            toks.reset();
            if (!toks.incrementToken()) {
                // System.out.println("empty body, won't index");
                return false;
            }
        } finally {
            toks.close();
        }
    }
    doc.add(new Field("body", body, BodyOptions));
    doc.add(new LongField("body-len", body.length(), Field.Store.YES));
    return true;
}

From source file:edu.virginia.cs.utility.StringTokenizer.java

/**
 * Method that generates a list of tokens from the parameter string.
 *
 * @param string the string to tokenize
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:elhuyar.bilakit.PayloadQParserPlugin.java

License:Open Source License

@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        //analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            //    not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = "";
        analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;
        // Note that this will work for any field defined with the
        //    <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);

        return plter;

    }
    return super.getFieldQuery(field, queryText, quoted);
}

From source file:engine.easy.analyzer.EasySearchAnalyzer.java

License:Apache License

private static void printResult(String text, Analyzer analyzer) throws IOException {

    int tokenCount = 0;
    TokenStream tokenStream = analyzer.tokenStream("FIELDNAME", new StringReader(text)); // this method will used for token streams
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        tokenCount++;
        String tokenText = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        System.out.println(" >> Token " + tokenCount + ": " + tokenText);
    }
}

From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java

License:Apache License

/**
 * Count the token stream tokens.
 *
 * @return an array of two counts: the total number of tokens and the number of distinct tokens.
 * @throws IOException if an I/O error occurs while reading the stream.
 */
private static int[] countTokenStream(TokenStream tokenStream) throws IOException {
    int v[] = new int[2];
    HashSet countTokenStreamBuffer = new HashSet();
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);

    while (tokenStream.incrementToken()) {
        v[0]++;
        countTokenStreamBuffer.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
    }

    v[1] = countTokenStreamBuffer.size();
    tokenStream.reset();
    countTokenStreamBuffer.clear();
    return v;
}

From source file:fr.ericlab.sondy.core.DataManipulation.java

License:Open Source License

public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();

        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        //            statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");

        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;

        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";

        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));
        Analyzer analyzer;
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class
                        .forName("org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }
        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);

        int docId = 0;
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = new String();
            String mentionContent = new String();
            String timestamps = new String();
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_"
                            + ngram + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                            + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();

            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);

            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException
            | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:fr.inrialpes.exmo.ontosim.string.CommonWords.java

License:Open Source License

private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            s.add(termAtt.term());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    /*
    Token token;
    try {
        while ((token = ts.next()) != null) {
       s.add(token.termText());
        }
    } catch (IOException ex) {
       ex.printStackTrace();
    }
    */
    map.put(e, s);
}

From source file:fr.inrialpes.exmo.ontosim.string.JWNLDistances.java

License:Open Source License

/**
 * Takes a gloss-like string (text) and returns it tokenized,
 * with:
 * - stopword removal
 * - lower-casing
 * - Porter stemming
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true, new LowerCaseTokenizer(new StringReader(s)), stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}

From source file:fr.inrialpes.exmo.ontosim.VectorSpaceMeasure.java

License:Open Source License

/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 * @param toAnalyse the string to be analysed
 * @param words the collection to add the extracted words to
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        while (tokenS.incrementToken()) {
            words.add(termAtt.term());
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}