Example usage for org.apache.lucene.analysis.standard StandardTokenizer close

List of usage examples for org.apache.lucene.analysis.standard StandardTokenizer close

Introduction

On this page you can find example usages of org.apache.lucene.analysis.standard StandardTokenizer close.

Prototype

@Override
    public void close() throws IOException 

Source Link

Usage

From source file:com.github.jiloc.USTweetsAnalyzer.Analyzer_Index.java

/**
 * Take in input a string and tokenize it into an ArrayList of strings(tokens) which is returned 
 * @param text - a string that has to be splited 
 * @return an ArrayList of strings /*w  w  w.jav  a2  s.  c om*/
 * @throws IOException 
 */
public ArrayList<String> tokenizeText(String text) throws IOException {
    StringReader reader = new StringReader(text);
    StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_41, reader);
    CharTermAttribute charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class);
    tokenizer.reset();
    ArrayList<String> tokens = new ArrayList<String>();

    while (tokenizer.incrementToken()) {
        tokens.add(charTermAttrib.toString());

    }
    tokenizer.end();
    tokenizer.close();
    //  System.out.println("tokenizetext: "+tokens.toString());
    return tokens;

}

From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java

License:Open Source License

/**
 * Builds entity suggestions for a (possibly partially typed) search term.
 *
 * <p>The term is stripped of diacritics and tokenized; each token becomes a required
 * ({@code +}) clause of a query string, and a trailing {@code *} is appended unless the term
 * ends with a space and produced at least one token. The query is run against the entity
 * index together with a clause on the community ids resolved from {@code userIdStr} and
 * {@code communityIdStrList}, sorted by document count descending. Unless aliasing is
 * disabled, hits are mapped onto their alias masters and de-duplicated, and alias masters
 * matching the term are placed at the top of the result.
 *
 * @param userIdStr          user id, used to resolve community ids and the alias table
 * @param term               the text typed so far
 * @param communityIdStrList community id list string, passed to the community id resolver
 * @param bIncludeGeo        if true, geotag and ontology type are requested and returned
 * @param bIncludeLinkdata   if true, linkdata is requested and returned
 * @param bWantNoAlias       if true, alias handling is skipped entirely
 * @return a response whose data is the list of suggestions and whose response object
 *         echoes the original term
 */
public ResponsePojo getSuggestions(String userIdStr, String term, String communityIdStrList,
        boolean bIncludeGeo, boolean bIncludeLinkdata, boolean bWantNoAlias) {
    long nSysTime = System.currentTimeMillis();

    ResponsePojo rp = new ResponsePojo();

    ElasticSearchManager gazIndex = ElasticSearchManager.getIndex(entityIndex_);

    // Need to do a quick decomposition of the term to fit in with analyzed strings
    StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30,
            new StringReader(ContentUtils.stripDiacritics(term)));
    CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();
    try {
        try {
            st.reset();
            while (st.incrementToken()) {
                // Every token becomes a mandatory clause: "+tok1 +tok2 ..."
                if (sb.length() > 0) {
                    sb.append(" +");
                } else {
                    sb.append('+');
                }
                sb.append(luceneEncodeTerm(termAtt.toString()));
            }
            // Complete the TokenStream workflow before closing
            st.end();
        } finally {
            st.close();
        }
    } catch (IOException e) {
        // Best effort: carry on with whatever tokens were collected before the failure
        e.printStackTrace();
    }

    if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
        sb.append('*');
    } //TESTED
    String escapedterm = sb.toString();

    // Create the search query

    SearchRequestBuilder searchOptions = gazIndex.getSearchOptions();
    BaseQueryBuilder queryObj1 = QueryBuilders.queryString(escapedterm)
            .defaultField(EntityFeaturePojoIndexMap.Mapping.RootObject.RootProperties.alias_pri_);

    String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);
    BaseQueryBuilder queryObj2 = QueryBuilders.boolQuery()
            .should(QueryBuilders.termsQuery(EntityFeaturePojo.communityId_, communityIdStrs));

    BaseQueryBuilder queryObj = QueryBuilders.boolQuery().must(queryObj1).must(queryObj2);

    searchOptions.addSort(EntityFeaturePojo.doccount_, SortOrder.DESC);
    searchOptions.addFields(EntityFeaturePojo.disambiguated_name_, EntityFeaturePojo.doccount_,
            EntityFeaturePojo.type_, EntityFeaturePojo.dimension_);
    if (bIncludeGeo) {
        searchOptions.addFields(EntityFeaturePojo.geotag_);
        searchOptions.addFields(EntityFeaturePojo.ontology_type_);
    }
    if (bIncludeLinkdata) {
        searchOptions.addFields(EntityFeaturePojo.linkdata_);
    }

    // Initial alias handling:

    AliasLookupTable aliasTable = null;
    HashMap<String, SearchSuggestPojo> aliasResults = null;
    if (!bWantNoAlias) {
        AliasManager aliasManager = AliasManager.getAliasManager();
        if (null != aliasManager) {
            aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null, userIdStr);
        }
    }
    //TESTED

    // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them
    List<EntityFeaturePojo> extraEntries = null;
    if (null != aliasTable) {
        extraEntries = checkAliasMasters(aliasTable, escapedterm);
    }
    // (end initial alias handling)

    int nDesiredSize = 20;
    if (null == aliasTable) {
        searchOptions.setSize(nDesiredSize); // will forward all 20
    } else {
        searchOptions.addFields(EntityFeaturePojo.index_);
        searchOptions.setSize(3 * nDesiredSize); // will forward top 20 after de-aliasing

        aliasResults = new HashMap<String, SearchSuggestPojo>();
        // (We use this to ensure we only include each entity once after aliasing)
    }
    //TESTED

    // Perform the search

    SearchResponse rsp = gazIndex.doQuery(queryObj, searchOptions);

    // Format the return values

    SearchHit[] docs = rsp.getHits().getHits();
    DimensionListPojo dimlist = new DimensionListPojo();
    int nDocsAdded = 0;

    if (null != extraEntries) { // Put the alias masters at the top:
        for (EntityFeaturePojo alias : extraEntries) {
            SearchSuggestPojo sp = new SearchSuggestPojo();
            if (null != alias.getDimension()) {
                sp.setDimension(alias.getDimension().toString());
            } else {
                sp.setDimension("What");
            }
            sp.setValue(alias.getDisambiguatedName());
            sp.setType(alias.getType());
            if (bIncludeGeo) {
                sp.setGeotag(alias.getGeotag());
            }
            sp.setOntology_type(alias.getOntology_type());
            dimlist.addSearchSuggestPojo(sp);
        }
    } //TESTED (inc geo)

    if (null != docs) {
        for (SearchHit hit : docs) {
            SearchHitField shf = hit.field(EntityFeaturePojo.disambiguated_name_);
            if (null == shf) { // robustness check, sometimes if the harvester goes wrong this field might be missing
                continue;
            }
            String disname = (String) shf.value();
            String type = (String) hit.field(EntityFeaturePojo.type_).value();
            String dimension = (String) hit.field(EntityFeaturePojo.dimension_).value();
            SearchSuggestPojo sp = new SearchSuggestPojo();

            sp.setValue(disname);
            sp.setDimension(dimension);
            sp.setType(type);
            if (bIncludeGeo) {
                SearchHitField loc = hit.field(EntityFeaturePojo.geotag_);
                if (loc != null) {
                    sp.setLocFromES((String) loc.value());
                }
                SearchHitField ont = hit.field(EntityFeaturePojo.ontology_type_);
                if (ont != null) {
                    sp.setOntology_type((String) ont.value());
                }
            }
            if (bIncludeLinkdata) {
                SearchHitField linkdata = hit.field(EntityFeaturePojo.linkdata_);
                if (linkdata != null) {
                    sp.setLinkdata(linkdata.values());
                }
            }

            // More alias handling
            String index = null;
            if (null != aliasTable) {
                index = (String) hit.field(EntityFeaturePojo.index_).value();
                EntityFeaturePojo alias = aliasTable.getAliasMaster(index);
                if (null != alias) { // Found!
                    if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
                        continue;
                    } else if ((null != alias.getDisambiguatedName()) && (null != alias.getType())) {
                        // (these need to be present)

                        //DEBUG (perf critical)
                        //logger.debug("Alias! Replace " + index + " with " + alias.getIndex());

                        index = alias.getIndex();
                        disname = alias.getDisambiguatedName();
                        type = alias.getType();
                        if (null != alias.getDimension()) {
                            dimension = alias.getDimension().toString();
                        } else { // Guess from type
                            dimension = DimensionUtility.getDimensionByType(type).toString();
                        }
                        // Reset values:
                        sp.setValue(disname);
                        sp.setDimension(dimension);
                        sp.setType(type);
                    }
                }
                SearchSuggestPojo existing = aliasResults.get(index);
                if (null != existing) {

                    //DEBUG (perf critical)
                    //logger.debug("Alias! Remove duplicate " + index);

                    if ((null == existing.getGeotag()) && (null != sp.getGeotag())) {
                        // (if they're both set then sigh just ignore on a first-come-first-served basis)
                        existing.setGeotag(sp.getGeotag());
                        existing.setOntology_type(sp.getOntology_type());
                    } //TESTED
                    if (null != sp.getLinkdata()) { // (here we can just combine the linkdata)
                        if (null == existing.getLinkdata()) {
                            existing.setLinkdata(sp.getLinkdata());
                        } else {
                            existing.getLinkdata().addAll(sp.getLinkdata());
                        }
                    } //TESTED
                    continue; // (ie don't add this guy)
                } else { // add it
                    aliasResults.put(index, sp);
                }
            }
            //TESTED
            // end more alias handling

            dimlist.addSearchSuggestPojo(sp);
            // (only adds unique entries, ie handles multiple communities "ok" (only ok
            //  because it doesn't sum the doccounts across multiple communities, you'd probably
            //  want to use facets for that, but it doesn't seem worth it, especially since we're
            //  pretty short on field cache space)

            if (++nDocsAdded >= nDesiredSize) { // (can happen in the de-aliasing case)
                break;
            } //TESTED
        }
    }
    rp.setData(dimlist);
    rp.setResponse(new ResponseObject("Suggestions", true, term));

    // Log at most once every 5000 ms
    if (nSysTime > (lastSuggestLog + 5000)) {
        lastSuggestLog = nSysTime;
        // docs is treated as nullable above, so don't dereference it blindly here
        int nFound = (null != docs) ? docs.length : 0;
        logMsg.setLength(0);
        logMsg.append("knowledge/searchSuggest query=").append(escapedterm);
        logMsg.append(" groups=").append(communityIdStrList);
        logMsg.append(" found=").append(nFound);
        logMsg.append(" time=").append(System.currentTimeMillis() - nSysTime).append(" ms");
        logger.info(logMsg.toString());
    }
    return rp;
}

From source file:com.ikanow.infinit.e.api.knowledge.SearchHandler.java

License:Open Source License

/**
 * Builds suggestions for one slot of an association (entity1, verb or entity2), optionally
 * constrained by values already chosen for the other slots.
 *
 * <p>A slot value equal to the literal string {@code "null"} is treated as "not specified".
 * The value of the slot named by {@code field} is tokenized into a query string (each token
 * required, with a trailing {@code *} unless it ends with a space and produced at least one
 * token); values for the other slots become additional mandatory query clauses. At most 20
 * distinct suggestions are returned.
 *
 * @param userIdStr          user id, used to resolve community ids and the alias table
 * @param ent1               entity1 value, or {@code "null"} if not specified
 * @param verb               verb value, or {@code "null"} if not specified
 * @param ent2               entity2 value, or {@code "null"} if not specified
 * @param field              the association field to produce suggestions for
 * @param communityIdStrList community id list string, passed to the community id resolver
 * @param bWantNoAlias       if true, alias handling is skipped entirely
 * @return a response holding the suggestion strings; if any exception occurs the response
 *         object is marked unsuccessful and carries the exception message
 */
public ResponsePojo getAssociationSuggestions(String userIdStr, String ent1, String verb, String ent2,
        String field, String communityIdStrList, boolean bWantNoAlias) {
    ResponsePojo rp = new ResponsePojo();
    try {
        // Community ids, needed in a couple of places
        String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);

        // Initial alias handling:
        AliasLookupTable aliasTable = null;
        if (!bWantNoAlias) {
            AliasManager aliasManager = AliasManager.getAliasManager();
            if (null != aliasManager) {
                aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null,
                        userIdStr);
            }
        } //TESTED

        ElasticSearchManager esm = ElasticSearchManager.getIndex(assocIndex_);
        SearchRequestBuilder searchOptions = esm.getSearchOptions();
        BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
        boolean bExtraQueryTerms = false;
        String term = "";
        if (!ent1.equals("null")) {
            if (field.equals(AssociationFeaturePojo.entity1_)) {
                term = ent1;
            } else {
                bExtraQueryTerms = true;
                EntityFeaturePojo alias = null;
                if (null != aliasTable) {
                    alias = aliasTable.getAliasMaster(ent1);
                }
                if (null != alias) { // Found!
                    boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity1_index_,
                            alias.getAlias().toArray()));
                } else {
                    boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity1_index_, ent1));
                } //TESTED
            }
        }
        if (!verb.equals("null")) {
            if (field.equals(AssociationFeaturePojo.verb_)) {
                term = verb;
            } else {
                bExtraQueryTerms = true;
                // Require every whitespace-separated word of the verb: "+w1 +w2 ..."
                boolQuery.must(QueryBuilders.queryString("+" + verb.replaceAll("\\s+", " +"))
                        .defaultField(AssociationFeaturePojo.verb_));
            }
        }
        if (!ent2.equals("null")) {
            if (field.equals(AssociationFeaturePojo.entity2_)) {
                term = ent2;
            } else {
                bExtraQueryTerms = true;
                EntityFeaturePojo alias = null;
                if (null != aliasTable) {
                    alias = aliasTable.getAliasMaster(ent2);
                }
                if (null != alias) { // Found!
                    boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity2_index_,
                            alias.getAlias().toArray()));
                } else {
                    boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity2_index_, ent2));
                }
            } //TESTED (cut and paste from entity1)
        }

        // Decompose the term being completed into mandatory query-string clauses
        StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30,
                new StringReader(ContentUtils.stripDiacritics(term)));
        CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
        StringBuilder sb = new StringBuilder();
        try {
            try {
                st.reset();
                while (st.incrementToken()) {
                    if (sb.length() > 0) {
                        sb.append(" +");
                    } else {
                        sb.append('+');
                    }
                    sb.append(luceneEncodeTerm(termAtt.toString()));
                }
                // Complete the TokenStream workflow before closing
                st.end();
            } finally {
                st.close();
            }
        } catch (IOException e) {
            // Best effort: carry on with whatever tokens were collected before the failure
            e.printStackTrace();
        }
        if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
            sb.append('*');
        } //TESTED

        String escapedterm = sb.toString();

        // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them
        List<EntityFeaturePojo> extraEntries = null;
        BoolQueryBuilder extraQueryTerms = null;
        if (field.startsWith("entity")) {
            String indexField = field.startsWith("entity1") ? "entity1_index" : "entity2_index";
            if (null != aliasTable) {
                extraEntries = checkAliasMasters(aliasTable, escapedterm);
            }
            if (null != extraEntries) {
                extraQueryTerms = QueryBuilders.boolQuery();
                int nExtraTerms = 0;
                Iterator<EntityFeaturePojo> aliasIt = extraEntries.iterator();
                while (aliasIt.hasNext()) {
                    EntityFeaturePojo alias = aliasIt.next();
                    nExtraTerms += alias.getAlias().size();

                    if (!bExtraQueryTerms && (nExtraTerms > 20)) { // If not filtering on event type we'll be more aggressive
                        break;
                    } //TESTED
                    if (bExtraQueryTerms && (nExtraTerms > 60)) { // If the number of terms gets too large bail anyway
                        break;
                    } //TESTED

                    extraQueryTerms.should(QueryBuilders.termsQuery(indexField, alias.getAlias().toArray()));
                    // Converted to a query clause, so drop it from the leftovers that get
                    // appended directly to the suggestions further down
                    aliasIt.remove();

                } //end loop over entities
            } //if found new aliases

        } //(if this is an entity lookup) TESTED - including breaking out because of # of terms

        // (end initial alias handling)

        if (null == extraQueryTerms) {
            boolQuery.must(QueryBuilders.queryString(escapedterm).defaultField(field));
        } else { //(in this case combine the escaped term with the aliases)
            extraQueryTerms.should(QueryBuilders.queryString(escapedterm).defaultField(field));
            boolQuery.must(extraQueryTerms);
        } //TESTED
        boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.communityId_, communityIdStrs));

        searchOptions.addSort(AssociationFeaturePojo.doccount_, SortOrder.DESC);

        // Work out which fields to return:
        //TODO (INF-1234) need to work out what to do with quotations and similar here (ie entityX without entityX_index)
        String returnfield;
        boolean bReturningEntities = true;
        if (field.equals(AssociationFeaturePojo.entity1_)) {
            returnfield = AssociationFeaturePojo.entity1_index_;
            searchOptions.addFields(AssociationFeaturePojo.entity1_index_, AssociationFeaturePojo.doccount_);
        } else if (field.equals(AssociationFeaturePojo.entity2_)) {
            returnfield = AssociationFeaturePojo.entity2_index_;
            searchOptions.addFields(AssociationFeaturePojo.entity2_index_, AssociationFeaturePojo.doccount_);
        } else {
            bReturningEntities = false;
            returnfield = AssociationFeaturePojo.verb_;
            searchOptions.addFields(AssociationFeaturePojo.verb_, AssociationFeaturePojo.verb_category_,
                    AssociationFeaturePojo.doccount_);
        }

        int nNumSuggestionsToReturn = 20;
        if (bReturningEntities && (null != aliasTable)) {
            searchOptions.setSize(3 * nNumSuggestionsToReturn); // we're going to remove some duplicates so get more than we need
        } else { // normal case
            searchOptions.setSize(nNumSuggestionsToReturn);
        }

        SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
        SearchHit[] docs = rsp.getHits().getHits();

        // Currently this code takes the results and puts them into a set so there are no
        // duplicates. Duplicates occur for example when you search for obama: you get
        // obama/quotation/quote1 and obama/travel/spain. May want to work this differently,
        // or at least sum up the frequency.
        Set<String> suggestions = new HashSet<String>();

        for (SearchHit hit : docs) {
            SearchHitField retField = hit.field(returnfield); // (this can be null in theory/by mistake)
            if (null != retField) {
                String suggestion = (String) retField.value();
                if (bReturningEntities && (null != aliasTable)) {
                    // More alias handling
                    EntityFeaturePojo alias = aliasTable.getAliasMaster(suggestion);
                    if (null != alias) { // Found!
                        if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
                            continue;
                        } else {
                            // (these need to be present)
                            suggestion = alias.getIndex();
                        }
                    } //TESTED
                } else { // (old code, still valid for verbs or no aliases)
                    if (returnfield.equals(AssociationFeaturePojo.verb_)
                            && hit.field(AssociationFeaturePojo.verb_category_) != null)
                    //for some reason verb_cat can be null!?!?! i think this is broken (ent1 facebook inc/company verb *)
                    {
                        String verbcat = (String) hit.field(AssociationFeaturePojo.verb_category_).value();
                        suggestion += " (" + verbcat + ")";
                        suggestions.add(verbcat);
                    }
                }
                suggestions.add(suggestion);

                if (suggestions.size() >= nNumSuggestionsToReturn) {
                    break;
                }

            } // (end return string valid)
        } //end loop over suggestions

        // Add any aliases that I couldn't explicitly convert to query terms
        if ((null != extraEntries) && (suggestions.size() < nNumSuggestionsToReturn)) {
            for (EntityFeaturePojo alias : extraEntries) {
                suggestions.add(alias.getIndex());
                if (suggestions.size() >= nNumSuggestionsToReturn) {
                    break;
                }
            }
        } //(end add any remaining entries)
          //TESTED

        String[] suggestionArray = new String[suggestions.size()];
        rp.setData(Arrays.asList(suggestions.toArray(suggestionArray)), (BasePojoApiMap<String>) null);

        // Echo back the value of whichever slot was being completed
        String searchTerm = "";
        if (field.equals(AssociationFeaturePojo.entity1_)) {
            searchTerm = ent1;
        } else if (field.equals(AssociationFeaturePojo.verb_)) {
            searchTerm = verb;
        } else {
            searchTerm = ent2;
        }

        rp.setResponse(new ResponseObject("Association Suggestions", true, searchTerm));
    } catch (Exception ex) {
        ex.printStackTrace();
        rp.setResponse(new ResponseObject("Association Suggestions", false,
                "Response returned unsuccessfully: " + ex.getMessage()));
    }
    return rp;
}