Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
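
Before the individual examples, here is a minimal, self-contained sketch of the typical consumer pattern: register the attribute with addAttribute, reset the stream, call getAttribute after each incrementToken, then end and close the stream. This is an illustrative sketch assuming the Lucene 4.x API used by most of the examples below; the field name "text" and the helper name printTerms are arbitrary.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class GetAttributeSketch {

    // Hypothetical helper: prints every term produced by a StandardAnalyzer for the given input.
    public static void printTerms(String input) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        TokenStream ts = analyzer.tokenStream("text", new StringReader(input));
        // addAttribute registers (or returns) the attribute instance; getAttribute below
        // returns that same shared instance and throws if it was never registered.
        ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            System.out.println(termAtt.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
    }
}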

Usage

From source file:ClassifierHD.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from the tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            if (status[i].getPath().getName().equals("rep.list")) {
                continue;
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }

            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));
            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(0, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}

From source file:PostgresClassifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [input postgres table]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from the tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection c = null;
    Statement stmt = null;
    Statement stmtU = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");
        stmt = c.createStatement();
        stmtU = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null");

        while (rs.next()) {
            String seq = rs.getString("seq");
            //String rep = rs.getString("rep");
            String body = rs.getString("body");
            //String category = rep;
            String id = seq;
            String message = body;

            //System.out.println("Doc: " + id + "\t" + message);

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from tweet
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    // if the word is not in the dictionary, skip it
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            // Mark : Modified 
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }
            // With the classifier, we get one score for each label 
            // The label with the highest score is the one the tweet is more likely to
            // be associated to
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
                //System.out.print("  " + labels.get(categoryId) + ": " + score);
            }
            //System.out.println(" => " + labels.get(bestCategoryId));
            //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id );
            stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId)
                    + "' WHERE seq = " + id);
        }
        rs.close();
        stmt.close();
        stmtU.close();
        c.commit();
        c.close();
        analyzer.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}

From source file:analysis.FtpFilePathAnalyzer.java

License:Apache License

public static void main(String[] args) {
    Analyzer ana = new FtpFilePathAnalyzer();
    String test2 = "c++c++";
    StringReader reader = new StringReader(test2);
    TokenStream ts = ana.tokenStream("path", reader);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
            OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
            TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
            System.out.print("(" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") ["
                    + posIncrAtt.getPositionIncrement() + "," + typeAtt.type() + "] " + "[" + termAtt.term()
                    + "]");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:at.ac.tuwien.ifs.myluceneanalyzers.fa.algorithm.PersianDictionaryCountCompoundWord.java

@SuppressWarnings({ "resource", "deprecation" })
private String stem(String input) throws IOException {
    String output = "";
    Reader reader = new StringReader(input);
    Tokenizer source = new StandardTokenizer(Version.LUCENE_4_10_3, reader);
    TokenStream tokenStream = new PersianStemFilter(source);

    CharTermAttribute charTermAttributeGreedy = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        output = output + " " + charTermAttributeGreedy.toString();

    }
    return output.trim();
}

From source file:at.newmedialab.lmf.util.solr.suggestion.service.FieldAnalyzerService.java

License:Apache License

/**
 * analyzes string like the default field
 * @param df the name of the default field
 * @param s the string to analyze
 * @return the analyzed string, with tokens separated by single spaces
 */
public static String analyzeString(SolrCore core, String df, String s) {
    try {
        TokenStream ts = core.getSchema().getFieldType(df).getQueryAnalyzer().tokenStream(df,
                new StringReader(s));
        StringBuffer b = new StringBuffer();
        ts.reset();
        while (ts.incrementToken()) {
            b.append(" ");
            CharTermAttribute attr = ts.getAttribute(CharTermAttribute.class);
            b.append(attr);
        }
        ts.end();
        ts.close();
        return b.toString().trim();
    } catch (IOException e) {
        e.printStackTrace();
        return s;
    }
}

From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java

License:Apache License

private void updateWeightHighlighting() {
    // remove previous highlighting
    removeHighLights(weightingHighLights);
    if (weightHighlightBox.isSelected()) {
        if (inputDataObjects.getTemplateVector() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Template vector file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        if (inputDataObjects.getInputData() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Input data file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }

        SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
        InputData data = inputDataObjects.getInputData();
        InputDatum input = data.getInputDatum(currentInput);

        double maxValue = data.getMaxValue();
        double minValue = data.getMinValue();
        double span = maxValue - minValue;

        // init paints
        Palette p = paletteSelectionPanel.getSelectedPalette();
        int paletteLength = p.getNumberOfColours();
        weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
        for (int i = 0; i < weightPaints.length; i++) {
            weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
        }

        String text = textPane.getText();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        try {
            while (stream.incrementToken()) {
                TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
                if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(),
                        "<APOSTROPHE>")) {
                    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
                    String term = termAttribute.term();
                    if (tv.containsLabel(term)) {
                        int index = tv.getIndex(term);
                        double value = input.getVector().getQuick(index);
                        int colorIndex = (int) (paletteLength / 4d
                                + relativeValue(minValue, span, value) * paletteLength / 2d);
                        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
                        Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                                offsetAttribute.endOffset(), weightPaints[colorIndex]);
                        weightingHighLights.add(tag);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        }
    }
}

From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java

public static List<String> tokenizeString(String linha) {

    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);

    List<String> result = new ArrayList<>();

    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {

            result.add(stream.getAttribute(CharTermAttribute.class).toString());

        }
    } catch (IOException e) {
        System.out.println(e.getMessage());

    }

    return result;
}

From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java

public static List<String> tokenizeString(String linha) {

    Analyzer analyzer = new StopAnalyzer();

    List<String> result = new ArrayList<>();

    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {

            result.add(stream.getAttribute(CharTermAttribute.class).toString());

        }
    } catch (IOException e) {
        System.out.println(e.getMessage());

    }

    return result;
}

From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java

public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }
        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        String query = "";
        BooleanQuery wrapBq = new BooleanQuery();
        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }

        stream.end();
        stream.close();
        // Token stream consumption ends here
        // Build the fuzzy query, which performs the approximate matching

        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            //new QueryParser(Version.LUCENE_47, field, new StandardAnalyzer(Version.LUCENE_47)).parse(query)
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Tempo para construo da query : " + (System.currentTimeMillis() - start) + "ms";
        // Pegando os documentos encontrado na pesquisa
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Tempo para busca : " + (System.currentTimeMillis() - start) + "ms";
        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}

From source file:byrne.mitre.MitreQuery.java

License:Apache License

public void run() {

    try {

        TokenStream tokenStream = analyzer.tokenStream("ngrams", new StringReader(entry.getFullName()));

        BooleanQuery bq = new BooleanQuery();
        while (tokenStream.incrementToken()) {
            Term t = new Term("ngrams", tokenStream.getAttribute(TermAttribute.class).term());
            bq.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
        }

        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(bq, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        for (int i = 0; i < hits.length; ++i) {

            int docId = hits[i].doc;
            Document d = searcher.doc(docId);

            out.write(entry.getID() + "|" + d.get("id") + "|" + df.format(hits[i].score) + "\n");
        }
    } catch (IOException IOE) {
        IOE.printStackTrace();
    }
}