Example usage for org.apache.lucene.analysis.standard StandardTokenizer reset

List of usage examples for org.apache.lucene.analysis.standard StandardTokenizer reset

Introduction

This page lists example usages of org.apache.lucene.analysis.standard StandardTokenizer reset.

Prototype

@Override
    public void reset() throws IOException 

Source Link

Usage

From source file:at.ac.tuwien.ifs.lupu.LangDetFilterFactoryTest.java

/**
 * Test of create method, of class LangDetFilterFactory.
 *
 * Builds the factory from classpath resources, tokenizes a sample string,
 * and verifies that the created filter can be consumed without error.
 */
@Test
public void testCreate() {
    try {
        System.out.println("create");
        Map<String, String> args = new HashMap<>();
        args.put("languages", "languages.txt");
        args.put("windowWidth", "1");
        LangDetFilterFactory factory = new LangDetFilterFactory(args);
        ResourceLoader loader = new ClasspathResourceLoader(getClass());
        factory.inform(loader);
        StringReader reader = new StringReader(" 34234 voil la France, hello@email.com here is England");
        StandardTokenizer st = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        LangDetFilter filter = (LangDetFilter) factory.create(st);
        try {
            // The Lucene TokenStream contract requires reset() on the
            // OUTERMOST stream before consumption, not on the wrapped
            // tokenizer (resetting only st leaves the filter unreset).
            filter.reset();
            while (filter.incrementToken()) {
                System.out.println("!!!" + filter.toString());
            }
            filter.end();
        } finally {
            filter.close(); // also closes the wrapped tokenizer and reader
        }
    } catch (IOException ex) {
        Logger.getLogger(LangDetFilterFactoryTest.class.getName()).log(Level.SEVERE, null, ex);
        fail("Exception thrown");
    }
}

From source file:com.bizosys.hsearch.outpipe.HQueryParser.java

License:Apache License

/**
 * Tokenizes the given text with Lucene's StandardTokenizer and returns the
 * tokens in order of appearance.
 *
 * @param word the raw text to tokenize
 * @return the token strings, empty when the input yields no tokens
 * @throws ApplicationFault if tokenization fails for any reason
 */
private static List<String> standardTokenizer(String word) throws ApplicationFault {
    Reader reader = new StringReader(word);
    StandardTokenizer fil = new StandardTokenizer(LuceneConstants.version, reader);
    List<String> lstWord = new ArrayList<String>(3);
    try {
        // addAttribute() registers the attribute if absent; the previous
        // getAttribute() + cast would throw when it was not yet registered.
        CharTermAttribute termA = fil.addAttribute(CharTermAttribute.class);
        fil.reset();
        while (fil.incrementToken()) {
            lstWord.add(termA.toString());
        }
        fil.end();
    } catch (Exception ex) {
        throw new ApplicationFault(ex);
    } finally {
        try {
            // Close unconditionally (the original leaked the reader on the
            // exception path); closing the tokenizer closes the reader too.
            fil.close();
        } catch (IOException e) {
            // best-effort cleanup; the primary failure already propagated
        }
    }
    return lstWord;
}

From source file:com.github.jiloc.USTweetsAnalyzer.Analyzer_Index.java

/**
 * Takes in input a string and tokenizes it into an ArrayList of strings
 * (tokens) which is returned.
 *
 * @param text a string that has to be split
 * @return an ArrayList of token strings, in order of appearance
 * @throws IOException if the tokenizer fails while reading the text
 */
public ArrayList<String> tokenizeText(String text) throws IOException {
    StringReader reader = new StringReader(text);
    StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_41, reader);
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        CharTermAttribute charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            tokens.add(charTermAttrib.toString());
        }
        tokenizer.end();
    } finally {
        // Release the tokenizer even when incrementToken() throws
        // (the original leaked it on the exception path).
        tokenizer.close();
    }
    return tokens;
}

From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java

License:Open Source License

/**
 * Returns entity suggestions matching a partially-typed search term.
 *
 * The term is tokenized (diacritics stripped), rebuilt as a "+tok +tok*"
 * Lucene query against the entity feature index, restricted to the caller's
 * communities, de-aliased via the community alias table, and the top results
 * are packaged as SearchSuggestPojo entries sorted by document count.
 *
 * @param userIdStr          id of the requesting user (used for community lookup)
 * @param term               the (possibly partial) term being typed
 * @param communityIdStrList comma-style list of community ids to search in
 * @param bIncludeGeo        include geotag/ontology fields in results
 * @param bIncludeLinkdata   include linkdata fields in results
 * @param bWantNoAlias       skip alias-table de-aliasing entirely
 * @return a ResponsePojo whose data is a DimensionListPojo of suggestions
 */
public ResponsePojo getSuggestions(String userIdStr, String term, String communityIdStrList,
        boolean bIncludeGeo, boolean bIncludeLinkdata, boolean bWantNoAlias) {
    long nSysTime = System.currentTimeMillis();

    ResponsePojo rp = new ResponsePojo();

    ElasticSearchManager gazIndex = ElasticSearchManager.getIndex(entityIndex_);

    // Need to do a quick decomposition of the term to fit in with analyzed strings
    String escapedterm = null;
    StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30,
            new StringReader(ContentUtils.stripDiacritics(term)));
    CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
    StringBuffer sb = new StringBuffer();
    try {
        try {
            st.reset();
            while (st.incrementToken()) {
                // Prefix every token with '+' (mandatory clause in Lucene syntax)
                if (sb.length() > 0) {
                    sb.append(" +");
                } else {
                    sb.append('+');
                }
                sb.append(luceneEncodeTerm(termAtt.toString()));
            }
        } finally {
            st.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
        sb.append('*');
    } //TESTED         
    escapedterm = sb.toString();

    // Create the search query

    SearchRequestBuilder searchOptions = gazIndex.getSearchOptions();
    BaseQueryBuilder queryObj1 = QueryBuilders.queryString(escapedterm)
            .defaultField(EntityFeaturePojoIndexMap.Mapping.RootObject.RootProperties.alias_pri_);

    String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);
    BaseQueryBuilder queryObj2 = QueryBuilders.boolQuery()
            .should(QueryBuilders.termsQuery(EntityFeaturePojo.communityId_, communityIdStrs));

    // Restrict the term match to the caller's communities
    BaseQueryBuilder queryObj = QueryBuilders.boolQuery().must(queryObj1).must(queryObj2);

    searchOptions.addSort(EntityFeaturePojo.doccount_, SortOrder.DESC);
    searchOptions.addFields(EntityFeaturePojo.disambiguated_name_, EntityFeaturePojo.doccount_,
            EntityFeaturePojo.type_, EntityFeaturePojo.dimension_);
    if (bIncludeGeo) {
        searchOptions.addFields(EntityFeaturePojo.geotag_);
        searchOptions.addFields(EntityFeaturePojo.ontology_type_);
    }
    if (bIncludeLinkdata) {
        searchOptions.addFields(EntityFeaturePojo.linkdata_);
    }

    // Initial alias handling:

    AliasLookupTable aliasTable = null;
    HashMap<String, SearchSuggestPojo> aliasResults = null;
    if (!bWantNoAlias) {
        AliasManager aliasManager = AliasManager.getAliasManager();
        if (null != aliasManager) {
            aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null, userIdStr);
        }
    }
    //TESTED

    // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them 
    List<EntityFeaturePojo> extraEntries = null;
    if (null != aliasTable) {
        extraEntries = checkAliasMasters(aliasTable, escapedterm);
    }
    // (end initial alias handling)

    int nDesiredSize = 20;
    if (null == aliasTable) {
        searchOptions.setSize(nDesiredSize); // will forward all 20
    } else {
        searchOptions.addFields(EntityFeaturePojo.index_);
        searchOptions.setSize(3 * nDesiredSize); // will forward top 20 after de-aliasing

        aliasResults = new HashMap<String, SearchSuggestPojo>();
        // (We use this to ensure we only include each entity once after aliasing)
    }
    //TESTED

    // Perform the search

    SearchResponse rsp = gazIndex.doQuery(queryObj, searchOptions);

    // Format the return values

    SearchHit[] docs = rsp.getHits().getHits();
    DimensionListPojo dimlist = new DimensionListPojo();
    int nDocsAdded = 0;

    if (null != extraEntries) { // Put the alias masters at the top:
        //DEBUG
        //System.out.println(Arrays.toString(extraEntries.toArray()));
        for (EntityFeaturePojo alias : extraEntries) {
            SearchSuggestPojo sp = new SearchSuggestPojo();
            if (null != alias.getDimension()) {
                sp.setDimension(alias.getDimension().toString());
            } else {
                sp.setDimension("What");
            }
            sp.setValue(alias.getDisambiguatedName());
            sp.setType(alias.getType());
            if (bIncludeGeo) {
                sp.setGeotag(alias.getGeotag());
            }
            sp.setOntology_type(alias.getOntology_type());
            dimlist.addSearchSuggestPojo(sp);
        }
    } //TESTED (inc geo)

    if (null != docs) {
        for (SearchHit hit : docs) {
            SearchHitField shf = hit.field(EntityFeaturePojo.disambiguated_name_);
            if (null == shf) { // robustness check, sometimes if the harvester goes wrong this field might be missing
                continue;
            }
            String disname = (String) shf.value();
            String type = (String) hit.field(EntityFeaturePojo.type_).value();
            String dimension = (String) hit.field(EntityFeaturePojo.dimension_).value();
            SearchSuggestPojo sp = new SearchSuggestPojo();

            sp.setValue(disname);
            sp.setDimension(dimension);
            sp.setType(type);
            if (bIncludeGeo) {
                SearchHitField loc = hit.field(EntityFeaturePojo.geotag_);
                if (loc != null)
                    sp.setLocFromES((String) loc.value());
                SearchHitField ont = hit.field(EntityFeaturePojo.ontology_type_);
                if (ont != null)
                    sp.setOntology_type((String) ont.value());
            }
            if (bIncludeLinkdata) {
                SearchHitField linkdata = hit.field(EntityFeaturePojo.linkdata_);
                if (linkdata != null)
                    sp.setLinkdata(linkdata.values());
            }

            // More alias handling
            String index = null;
            if (null != aliasTable) {
                index = (String) hit.field(EntityFeaturePojo.index_).value();
                EntityFeaturePojo alias = aliasTable.getAliasMaster(index);
                if (null != alias) { // Found!
                    if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
                        continue;
                    } else if ((null != alias.getDisambiguatedName()) && (null != alias.getType())) {
                        // (these need to be present)

                        //DEBUG (perf critical)
                        //logger.debug("Alias! Replace " + index + " with " + alias.getIndex());

                        index = alias.getIndex();
                        disname = alias.getDisambiguatedName();
                        type = alias.getType();
                        if (null != alias.getDimension()) {
                            dimension = alias.getDimension().toString();
                        } else { // Guess from type
                            dimension = DimensionUtility.getDimensionByType(type).toString();
                        }
                        // Reset values:
                        sp.setValue(disname);
                        sp.setDimension(dimension);
                        sp.setType(type);
                    }
                }
                SearchSuggestPojo existing = aliasResults.get(index);
                if (null != existing) {

                    //DEBUG (perf critical)
                    //logger.debug("Alias! Remove duplicate " + index);

                    if ((null == existing.getGeotag()) && (null != sp.getGeotag())) {
                        // (if they're both set then sigh just ignore on a first-come-first-served basis)
                        existing.setGeotag(sp.getGeotag());
                        existing.setOntology_type(sp.getOntology_type());
                    } //TESTED
                    if (null != sp.getLinkdata()) { // (here we can just combine the linkdata)
                        if (null == existing.getLinkdata()) {
                            existing.setLinkdata(sp.getLinkdata());
                        } else {
                            existing.getLinkdata().addAll(sp.getLinkdata());
                        }
                    } //TESTED
                    continue; // (ie don't add this guy)
                } else { // add it
                    aliasResults.put(index, sp);
                }
            }
            //TESTED
            // end more alias handing                        

            dimlist.addSearchSuggestPojo(sp);
            // (only adds unique entries, ie handles multiple communities "ok" (only ok
            //  because it doesn't sum the doccounts across multiple communities, you'd probably
            //  want to use facets for that, but it doesn't seem worth it, especially since we're
            //  pretty short on field cache space)

            if (++nDocsAdded >= nDesiredSize) { // (can happen in the de-aliasing case)
                break;
            } //TESTED
        }
    }
    rp.setData(dimlist);
    rp.setResponse(new ResponseObject("Suggestions", true, term));

    // Rate-limited query logging: at most once every 5 seconds
    if (nSysTime > (lastSuggestLog + 5000)) {
        lastSuggestLog = nSysTime;
        logMsg.setLength(0);
        logMsg.append("knowledge/searchSuggest query=").append(escapedterm);
        logMsg.append(" groups=").append(communityIdStrList);
        logMsg.append(" found=").append(docs.length);
        logMsg.append(" time=").append(System.currentTimeMillis() - nSysTime).append(" ms");
        logger.info(logMsg.toString());
    }
    return rp;
}

From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java

License:Open Source License

/**
 * Returns suggestions for one slot (entity1/verb/entity2) of an association
 * query, given values typed into the other slots.
 *
 * The active slot's term is tokenized into a "+tok +tok*" Lucene query
 * (diacritics stripped); the other slots become must-clauses (de-aliased via
 * the community alias table where available). Results are deduplicated into
 * a set and returned as a ResponsePojo.
 *
 * @param userIdStr          id of the requesting user (used for community lookup)
 * @param ent1               entity1 value, or the string "null" when unset
 * @param verb               verb value, or the string "null" when unset
 * @param ent2               entity2 value, or the string "null" when unset
 * @param field              which association field suggestions are wanted for
 * @param communityIdStrList communities to search in
 * @param bWantNoAlias       skip alias-table de-aliasing entirely
 * @return a ResponsePojo with up to 20 unique suggestion strings; on any
 *         exception the response object is marked unsuccessful
 */
public ResponsePojo getAssociationSuggestions(String userIdStr, String ent1, String verb, String ent2,
        String field, String communityIdStrList, boolean bWantNoAlias) {
    ResponsePojo rp = new ResponsePojo();
    try {
        // Community ids, needed in a couple of places
        String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);

        // Initial alias handling:
        AliasLookupTable aliasTable = null;
        // Initial alias handling:         
        if (!bWantNoAlias) {
            AliasManager aliasManager = AliasManager.getAliasManager();
            if (null != aliasManager) {
                aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null,
                        userIdStr);
            }
        } //TESTED                              

        ElasticSearchManager esm = ElasticSearchManager.getIndex(assocIndex_);
        SearchRequestBuilder searchOptions = esm.getSearchOptions();
        BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
        boolean bExtraQueryTerms = false;
        String term = "";
        // For each slot: if it is the slot being completed, it becomes the
        // "term"; otherwise it is added as a mandatory filter clause.
        if (!ent1.equals("null")) {
            if (field.equals(AssociationFeaturePojo.entity1_))
                term = ent1;
            else {
                bExtraQueryTerms = true;
                EntityFeaturePojo alias = null;
                if (null != aliasTable) {
                    alias = aliasTable.getAliasMaster(ent1);
                }
                if (null != alias) { // Found!
                    boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity1_index_,
                            alias.getAlias().toArray()));
                } else {
                    boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity1_index_, ent1));
                } //TESTED
            }
        }
        if (!verb.equals("null")) {
            if (field.equals(AssociationFeaturePojo.verb_))
                term = verb;
            else {
                bExtraQueryTerms = true;
                boolQuery.must(QueryBuilders
                        .queryString(new StringBuffer("+").append(verb.replaceAll("\\s+", " +")).toString())
                        .defaultField(AssociationFeaturePojo.verb_));
            }
        }
        if (!ent2.equals("null")) {
            if (field.equals(AssociationFeaturePojo.entity2_))
                term = ent2;
            else {
                bExtraQueryTerms = true;
                EntityFeaturePojo alias = null;
                if (null != aliasTable) {
                    alias = aliasTable.getAliasMaster(ent2);
                }
                if (null != alias) { // Found!
                    boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity2_index_,
                            alias.getAlias().toArray()));
                } else {
                    boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity2_index_, ent2));
                }
            } //TESTED (cut and paste from entity1)
        }

        // Tokenize the active slot's term into "+tok +tok" Lucene syntax
        String escapedterm = null;
        StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30,
                new StringReader(ContentUtils.stripDiacritics(term)));
        CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
        StringBuffer sb = new StringBuffer();
        try {
            try {
                st.reset();
                while (st.incrementToken()) {
                    if (sb.length() > 0) {
                        sb.append(" +");
                    } else {
                        sb.append('+');
                    }
                    sb.append(luceneEncodeTerm(termAtt.toString()));
                }
            } finally {
                st.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
            sb.append('*');
        } //TESTED         

        escapedterm = sb.toString();

        // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them 
        List<EntityFeaturePojo> extraEntries = null;
        BoolQueryBuilder extraQueryTerms = null;
        if (field.startsWith("entity")) {
            String indexField = field.startsWith("entity1") ? "entity1_index" : "entity2_index";
            if (null != aliasTable) {
                extraEntries = checkAliasMasters(aliasTable, escapedterm);
            }
            if (null != extraEntries) {
                extraQueryTerms = QueryBuilders.boolQuery();
                int nExtraTerms = 0;
                Iterator<EntityFeaturePojo> aliasIt = extraEntries.iterator();
                while (aliasIt.hasNext()) {
                    EntityFeaturePojo alias = aliasIt.next();
                    nExtraTerms += alias.getAlias().size();

                    if (!bExtraQueryTerms && (nExtraTerms > 20)) { // If not filtering on event type we'll be more aggressive
                        break;
                    } //TESTED
                    if (bExtraQueryTerms && (nExtraTerms > 60)) { // If the number of terms gets too large bail anyway
                        break;
                    } //TESTED

                    extraQueryTerms.should(QueryBuilders.termsQuery(indexField, alias.getAlias().toArray()));
                    aliasIt.remove();

                } //end loop over entities 
            } //if found new aliases

        } //(if this is an entity lookup) TESTED - including breaking out because of # of terms 

        // (end initial alias handling)

        if (null == extraQueryTerms) {
            boolQuery.must(QueryBuilders.queryString(escapedterm).defaultField(field));
        } else {//(in this case combine the escaped term with the aliases
            extraQueryTerms.should(QueryBuilders.queryString(escapedterm).defaultField(field));
            boolQuery.must(extraQueryTerms);
        } //TESTED
        boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.communityId_, communityIdStrs));

        searchOptions.addSort(AssociationFeaturePojo.doccount_, SortOrder.DESC);

        // Work out which fields to return:
        //TODO (INF-1234) need to work out what to do with quotations and similar here (ie entityX without entityX_index) 
        String returnfield;
        boolean bReturningEntities = true;
        if (field.equals(AssociationFeaturePojo.entity1_)) {
            returnfield = AssociationFeaturePojo.entity1_index_;
            searchOptions.addFields(AssociationFeaturePojo.entity1_index_, AssociationFeaturePojo.doccount_);
        } else if (field.equals(AssociationFeaturePojo.entity2_)) {
            returnfield = AssociationFeaturePojo.entity2_index_;
            searchOptions.addFields(AssociationFeaturePojo.entity2_index_, AssociationFeaturePojo.doccount_);
        } else {
            bReturningEntities = false;
            returnfield = AssociationFeaturePojo.verb_;
            searchOptions.addFields(AssociationFeaturePojo.verb_, AssociationFeaturePojo.verb_category_,
                    AssociationFeaturePojo.doccount_);
        }

        int nNumSuggestionsToReturn = 20;
        if (bReturningEntities && (null != aliasTable)) {
            searchOptions.setSize(3 * nNumSuggestionsToReturn); // we're going to remove some duplicates so get more than we need
        } else { // normal case
            searchOptions.setSize(nNumSuggestionsToReturn);
        }

        SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
        SearchHit[] docs = rsp.getHits().getHits();

        // Currently this code takes the results and puts them into a set so
        // there are no duplicates. Duplicates occur, for example, when you
        // search for "obama": you get obama/quotation/quote1 and
        // obama/travel/spain. We may want to handle this differently, or at
        // least sum up the frequency.
        Set<String> suggestions = new HashSet<String>();

        for (SearchHit hit : docs) {
            SearchHitField retField = hit.field(returnfield); // (this can be null in theory/by mistake)
            if (null != retField) {
                String suggestion = (String) retField.value();
                if (bReturningEntities && (null != aliasTable)) {
                    // More alias handling
                    EntityFeaturePojo alias = aliasTable.getAliasMaster(suggestion);
                    if (null != alias) { // Found!
                        if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
                            continue;
                        } else {
                            // (these need to be present)
                            suggestion = alias.getIndex();
                        }
                    } //TESTED
                } else { // (old code, still valid for verbs or no aliases) 
                    if (returnfield.equals(AssociationFeaturePojo.verb_)
                            && hit.field(AssociationFeaturePojo.verb_category_) != null)
                    //for some reason verb_cat can be null!?!?! i think this is broken (ent1 facebook inc/company verb *)
                    {
                        String verbcat = (String) hit.field(AssociationFeaturePojo.verb_category_).value();
                        suggestion += " (" + verbcat + ")";
                        suggestions.add(verbcat);
                    }
                }
                suggestions.add(suggestion);

                if (suggestions.size() >= nNumSuggestionsToReturn) {
                    break;
                }

            } // (end return string valid)
        } //end loop over suggestions

        // Add any aliases that I couldn't explicitly convert to query terms
        if ((null != extraEntries) && (suggestions.size() < nNumSuggestionsToReturn)) {
            for (EntityFeaturePojo alias : extraEntries) {
                suggestions.add(alias.getIndex());
                if (suggestions.size() >= nNumSuggestionsToReturn) {
                    break;
                }
            }
        } //(end add any remaining entries)
          //TESTED         

        String[] suggestionArray = new String[suggestions.size()];
        rp.setData(Arrays.asList(suggestions.toArray(suggestionArray)), (BasePojoApiMap<String>) null);

        // Echo back the term that was being completed
        String searchTerm = "";
        if (field.equals(AssociationFeaturePojo.entity1_))
            searchTerm = ent1;
        else if (field.equals(AssociationFeaturePojo.verb_))
            searchTerm = verb;
        else
            searchTerm = ent2;

        rp.setResponse(new ResponseObject("Association Suggestions", true, searchTerm));
    } catch (Exception ex) {
        ex.printStackTrace();
        rp.setResponse(new ResponseObject("Association Suggestions", false,
                "Response returned unsuccessfully: " + ex.getMessage()));
    }
    return rp;
}

From source file:ru.mail.sphere.java_hw5_vasilyev.container.TweetsContainerImpl.java

/**
 * Computes normalized token frequencies for a tweet's content.
 *
 * Tokens shorter than 3 characters are ignored; each remaining token's
 * count is divided by the total number of counted tokens, so the returned
 * values sum to 1 (when any token qualified).
 *
 * @param tweet the tweet whose content is analyzed
 * @return map from token to its relative frequency; empty on tokenizer error
 */
private Map<String, Double> getTweetStatistics(Tweet tweet) {
    Map<String, Double> tweetStats = new HashMap<>();
    // try-with-resources: the original never closed the tokenizer or called
    // end(), leaking the stream on every call.
    try (StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_36,
            AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
            new StringReader(tweet.getContent()))) {
        CharTermAttribute attribute = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        int countOfMeanings = 0;
        while (tokenizer.incrementToken()) {
            String currentToken = attribute.toString();
            // Check if token is not shorter than 3 characters
            if (currentToken.length() >= 3) {
                tweetStats.put(currentToken, tweetStats.getOrDefault(currentToken, 0.0) + 1.0);
                countOfMeanings++;
            }
        }
        tokenizer.end();
        // Normalization: mutate entries in place rather than re-putting by key
        if (countOfMeanings > 0) {
            for (Map.Entry<String, Double> entry : tweetStats.entrySet()) {
                entry.setValue(entry.getValue() / countOfMeanings);
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(TweetsContainerImpl.class.getName()).log(Level.SEVERE, null, ex);
    }
    return tweetStats;
}