List of usage examples for org.apache.lucene.analysis.TokenStream#end()
public void end() throws IOException
Called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
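Before the per-project examples, here is a minimal sketch of the consume cycle every caller below follows: reset(), then incrementToken() until it returns false, then end(), then close(). This sketch is not taken from any of the listed sources; it assumes a Lucene release where StandardAnalyzer has a no-argument constructor (older releases, such as the LUCENE_43 code below, pass a Version), and the field name "body" and the sample text are arbitrary placeholders.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenStreamEndSketch {
        public static void main(String[] args) throws IOException {
            try (Analyzer analyzer = new StandardAnalyzer()) {
                TokenStream ts = analyzer.tokenStream("body", new StringReader("a small sample text"));
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();                    // mandatory before the first incrementToken()
                while (ts.incrementToken()) {  // returns false once the stream is exhausted
                    System.out.println(termAtt.toString());
                }
                ts.end();    // record end-of-stream state (e.g. the final offset)
                ts.close();  // release underlying resources
            }
        }
    }

Since TokenStream implements Closeable, the stream itself can also be placed in a try-with-resources block so close() is guaranteed even if tokenization throws.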
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    // indices 0..5 are read below, so at least 6 arguments are required
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from the message
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => " + lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}
From source file:PostgresClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [input postgres table]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection c = null;
    Statement stmt = null;
    Statement stmtU = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");

        stmt = c.createStatement();
        stmtU = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null");

        while (rs.next()) {
            String seq = rs.getString("seq");
            //String rep = rs.getString("rep");
            String body = rs.getString("body");
            //String category = rep;
            String id = seq;
            String message = body;
            //System.out.println("Doc: " + id + "\t" + message);

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from tweet
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    // if the word is not in the dictionary, skip it
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            // Mark : Modified
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            // With the classifier, we get one score for each label.
            // The label with the highest score is the one the tweet is most likely
            // to be associated with.
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
                //System.out.print(" " + labels.get(categoryId) + ": " + score);
            }
            //System.out.println(" => " + labels.get(bestCategoryId));

            //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id);
            stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId)
                    + "' WHERE seq = " + id);
        }
        rs.close();
        stmt.close();
        stmtU.close();
        c.commit();
        c.close();
        analyzer.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Returns a token array from the {@link org.apache.lucene.analysis.TokenStream} in input
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
    Collection<String> tokens = new LinkedList<>();
    CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
    tokenizedText.reset();
    while (tokenizedText.incrementToken()) {
        tokens.add(charTermAttribute.toString());
    }
    tokenizedText.end();
    tokenizedText.close();
    return tokens.toArray(new String[tokens.size()]);
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    //     ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}
From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    //     ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        //     throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = 32; // space
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    if (phraseTerm) {
        reuse.grow(reuse.length + 2); /* current + word + separator */
        reuse.length += 2;
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}
From source file:at.itbh.bev.apibeans.FinderImpl.java
License:Open Source License
public FullTextQuery constructQuery(EntityManager em, String postalCode, String place, String addressLine,
        String houseId) throws InvalidApiUsageException {
    FullTextEntityManager fullTextEm = Search.getFullTextEntityManager(em);

    if ((Objects.toString(postalCode, "") + Objects.toString(place, "") + Objects.toString(addressLine, "")
            + Objects.toString(houseId, "")).length() == 0) {
        throw new InvalidApiUsageException(
                "At least one parameter must be provided. Coordinates don't count as parameters.");
    }
    if (addressLine != null && addressLine.length() < 2 && addressLine.length() > 0) {
        throw new InvalidApiUsageException("The parameter addressLine must consist of at least 2 characters.");
    }

    QueryBuilder b = fullTextEm.getSearchFactory().buildQueryBuilder().forEntity(AdresseDenormalized.class)
            .get();
    List<Query> queries = new ArrayList<>();

    if (postalCode != null && postalCode.length() > 0) {
        queries.add(b.keyword().onField("postalCode").boostedTo(20).matching(postalCode).createQuery());
    }

    if (addressLine != null && addressLine.length() > 0) {
        queries.add(b.keyword().onField("addressLine").matching(addressLine + addressLine + addressLine)
                .createQuery());
        // triple addressLine since in the data source it is also tripled if
        // there is no building or address name
        queries.add(b.keyword().onField("addressLineExact").boostedTo(10)
                .matching(addressLine + addressLine + addressLine).createQuery());
    }

    if (houseId != null && houseId.length() > 0) {
        // if the search string contains a number, take the first number in the
        // search string and match it against the house number
        Matcher matcher = housenumberPattern.matcher(houseId);
        if (matcher.find()) {
            queries.add(
                    b.keyword().onField("hausnrzahl").boostedTo(50).matching(matcher.group(1)).createQuery());
        }
        if (houseId.matches(".*\\D.*")) {
            queries.add(b.keyword().onField("houseIdExact").matching(houseId).createQuery());
        }
        queries.add(b.keyword().onField("houseId").boostedTo(20).matching(houseId).createQuery());

        TextAnalyzer analyzer = new TextAnalyzer();
        TokenStream stream;
        try {
            stream = analyzer.tokenStream(null, new StringReader(houseId));
            // CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                // if the analyzer does not remove everything, also check hofname and hausnrgebaeudebez
                queries.add(b.keyword().onField("hofname").matching(houseId).createQuery());
                queries.add(b.keyword().onField("hausnrgebaeudebez").matching(houseId).createQuery());
                // System.out.println(cattr.toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        analyzer.close();
    }

    if (place != null && place.length() > 0) {
        queries.add(b.keyword().onField("place").matching(place).createQuery());
        queries.add(b.keyword().onField("municipalityExact").boostedTo(20).matching(place).createQuery());
        queries.add(b.keyword().onField("placeExact").boostedTo(5).matching(place).createQuery());
    }

    @SuppressWarnings("rawtypes")
    BooleanJunction bq = b.bool();
    for (Query item : queries) {
        bq = bq.should(item);
    }

    FullTextQuery fullTextQuery = fullTextEm.createFullTextQuery(bq.createQuery(), AdresseDenormalized.class);
    return fullTextQuery;
}
From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}
From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }

        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        String query = "";
        BooleanQuery wrapBq = new BooleanQuery();

        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }
        stream.end();
        stream.close();
        // Tokenization ends here.

        // Build the wrapping query; the fuzzy query performs the approximate matching.
        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            //new QueryParser(Version.LUCENE_47, field, new StandardAnalyzer(Version.LUCENE_47)).parse(query)
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Time to build the query: " + (System.currentTimeMillis() - start) + "ms";

        // Fetch the documents found by the search
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Time to search: " + (System.currentTimeMillis() - start) + "ms";

        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public boolean contain(String label) {
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // use the boolean query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.MUST);
        }

        ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
        searcher.search(bq, collector);
        boolean ret = collector.isExistQueryLabel();
        reader.close();
        return ret;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return false;
}
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // get terms from query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.SHOULD);
            // expand using synonyms
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                syn = stemer.getCurrent();
                if (expandedQueryTermSet.add(syn)) {
                    term = new Term(LabelDocument.FIELD_LABEL, syn);
                    termQuery = new TermQuery(term);
                    bq.add(termQuery, Occur.SHOULD);
                }
            }
        }

        // search in the label index
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader,
                queryTermSet, similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
        searcher.close();
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}