List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value.
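All of the examples below follow the same pattern: obtain the attribute instances from the stream, then call incrementToken() and read the (reused) attribute objects for each token. A minimal, self-contained sketch of that pattern follows; the demo class, field name, and sample text are made up for illustration, and it assumes a Lucene release whose StandardAnalyzer constructor takes no Version argument (roughly 4.10+):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Lucene token streams expose per-token attributes")) {
            // addAttribute registers the attribute if missing; getAttribute only
            // returns an attribute the stream already carries.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // The same attribute instances are reused for every token,
                // so they must be read inside the loop.
                OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
                System.out.println(term.toString() + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
            }
            ts.end();
        }
        analyzer.close();
    }
}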
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];
    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;
    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}
From source file:PostgresClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [input postgres table]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection c = null;
    Statement stmt = null;
    Statement stmtU = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");
        stmt = c.createStatement();
        stmtU = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null");

        while (rs.next()) {
            String seq = rs.getString("seq");
            //String rep = rs.getString("rep");
            String body = rs.getString("body");
            //String category = rep;
            String id = seq;
            String message = body;
            //System.out.println("Doc: " + id + "\t" + message);

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from tweet
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    // if the word is not in the dictionary, skip it
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            // Mark : Modified
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            // With the classifier, we get one score for each label
            // The label with the highest score is the one the tweet is more likely to
            // be associated to
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
                //System.out.print("  " + labels.get(categoryId) + ": " + score);
            }
            //System.out.println(" => " + labels.get(bestCategoryId));
            //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id);
            stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId)
                    + "' WHERE seq = " + id);
        }
        rs.close();
        stmt.close();
        stmtU.close();
        c.commit();
        c.close();
        analyzer.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}
From source file:analysis.FtpFilePathAnalyzer.java
License:Apache License
public static void main(String[] args) {
    Analyzer ana = new FtpFilePathAnalyzer();
    String test2 = "c++c++";
    StringReader reader = new StringReader(test2);
    TokenStream ts = ana.tokenStream("path", reader);
    try {
        while (ts.incrementToken()) {
            TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
            OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts
                    .getAttribute(PositionIncrementAttribute.class);
            TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
            System.out.print("(" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") ["
                    + posIncrAtt.getPositionIncrement() + "," + typeAtt.type() + "] "
                    + "[" + termAtt.term() + "]");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:at.ac.tuwien.ifs.myluceneanalyzers.fa.algorithm.PersianDictionaryCountCompoundWord.java
@SuppressWarnings({ "resource", "deprecation" })
private String stem(String input) throws IOException {
    String output = "";
    Reader reader = new StringReader(input);
    Tokenizer source = new StandardTokenizer(Version.LUCENE_4_10_3, reader);
    TokenStream tokenStream = new PersianStemFilter(source);
    CharTermAttribute charTermAttributeGreedy = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        output = output + " " + charTermAttributeGreedy.toString();
    }
    return output.trim();
}
From source file:at.newmedialab.lmf.util.solr.suggestion.service.FieldAnalyzerService.java
License:Apache License
/**
 * analyzes string like the default field
 * @param df the name of the default field
 * @param s the string to analyze
 * @return
 */
public static String analyzeString(SolrCore core, String df, String s) {
    try {
        TokenStream ts = core.getSchema().getFieldType(df).getQueryAnalyzer().tokenStream(df,
                new StringReader(s));
        StringBuffer b = new StringBuffer();
        ts.reset();
        while (ts.incrementToken()) {
            b.append(" ");
            CharTermAttribute attr = ts.getAttribute(CharTermAttribute.class);
            b.append(attr);
        }
        return b.toString().trim();
    } catch (IOException e) {
        e.printStackTrace();
        return s;
    }
}
From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java
License:Apache License
private void updateWeightHighlighting() {
    // remove previous highlighting
    removeHighLights(weightingHighLights);
    if (weightHighlightBox.isSelected()) {
        if (inputDataObjects.getTemplateVector() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Template vector file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        if (inputDataObjects.getInputData() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Input data file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
        InputData data = inputDataObjects.getInputData();
        InputDatum input = data.getInputDatum(currentInput);
        double maxValue = data.getMaxValue();
        double minValue = data.getMinValue();
        double span = maxValue - minValue;

        // init paints
        Palette p = paletteSelectionPanel.getSelectedPalette();
        int paletteLength = p.getNumberOfColours();
        weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
        for (int i = 0; i < weightPaints.length; i++) {
            weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
        }

        String text = textPane.getText();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        try {
            while (stream.incrementToken()) {
                TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
                if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(), "<APOSTROPHE>")) {
                    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
                    String term = termAttribute.term();
                    if (tv.containsLabel(term)) {
                        int index = tv.getIndex(term);
                        double value = input.getVector().getQuick(index);
                        int colorIndex = (int) (paletteLength / 4d
                                + relativeValue(minValue, span, value) * paletteLength / 2d);
                        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
                        offsetAttribute.startOffset();
                        Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                                offsetAttribute.endOffset(), weightPaints[colorIndex]);
                        weightingHighLights.add(tag);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        }
    }
}
From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer();
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }

        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        String query = "";
        BooleanQuery wrapBq = new BooleanQuery();
        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }
        stream.end();
        stream.close();
        // Tokenization ends here.

        // Build the wrapping query; the fuzzy query performs the approximate matching.
        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            //new QueryParser(Version.LUCENE_47, field, new StandardAnalyzer(Version.LUCENE_47)).parse(query)
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Tempo para construção da query : " + (System.currentTimeMillis() - start) + "ms";

        // Collect the documents found by the search.
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Tempo para busca : " + (System.currentTimeMillis() - start) + "ms";

        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:byrne.mitre.MitreQuery.java
License:Apache License
public void run() {
    try {
        TokenStream tokenStream = analyzer.tokenStream("ngrams", new StringReader(entry.getFullName()));
        BooleanQuery bq = new BooleanQuery();
        while (tokenStream.incrementToken()) {
            Term t = new Term("ngrams", tokenStream.getAttribute(TermAttribute.class).term());
            bq.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
        }
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(bq, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            out.write(entry.getID() + "|" + d.get("id") + "|" + df.format(hits[i].score) + "\n");
        }
    } catch (IOException IOE) {
    }
}