Example usage for org.apache.solr.analysis TokenizerChain TokenizerChain

List of usage examples for org.apache.solr.analysis TokenizerChain TokenizerChain

Introduction

In this page you can find the example usage for org.apache.solr.analysis TokenizerChain TokenizerChain.

Prototype

public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer,
        TokenFilterFactory[] filters) 

Source Link

Document

Creates a new TokenizerChain.

Usage

From source file:com.sindicetech.siren.solr.schema.AnalyzerConfigReader.java

License:Open Source License

/**
 * Read an analyzer definition and instantiate an {@link Analyzer} object.
 *
 * <p> Code taken from {@link org.apache.solr.schema.FieldTypePluginLoader#readAnalyzer(org.w3c.dom.Node)}
 *
 * <p> An analyzer is either declared with an explicit {@code class} attribute,
 * or assembled from nested {@code <charFilter>}, {@code <tokenizer>} and
 * {@code <filter>} factory elements; the two forms are mutually exclusive.
 *
 * @param node An analyzer node from the config file (may be null)
 * @param loader Resource loader used to resolve analyzer and factory classes
 * @param luceneMatchVersion Default Lucene version applied to factories that
 *        do not declare their own {@code luceneMatchVersion} attribute
 * @return An analyzer, or null if {@code node} is null
 * @throws XPathExpressionException If an XPath expression cannot be evaluated
 */
protected static Analyzer readAnalyzer(final Node node, final SolrResourceLoader loader,
        final Version luceneMatchVersion) throws XPathExpressionException {
    if (node == null)
        return null;
    final NamedNodeMap attrs = node.getAttributes();

    final String analyzerName = DOMUtil.getAttr(attrs, "class");

    // check for all of these up front, so we can error if used in
    // conjunction with an explicit analyzer class.
    final XPath xpath = XPathFactory.newInstance().newXPath();
    final NodeList charFilterNodes = (NodeList) xpath.evaluate("./charFilter", node, XPathConstants.NODESET);
    final NodeList tokenizerNodes = (NodeList) xpath.evaluate("./tokenizer", node, XPathConstants.NODESET);
    final NodeList tokenFilterNodes = (NodeList) xpath.evaluate("./filter", node, XPathConstants.NODESET);

    if (analyzerName != null) {

        // explicitly check for child analysis factories instead of
        // just any child nodes, because the user might have their
        // own custom nodes (ie: <description> or something like that)
        if (0 != charFilterNodes.getLength() || 0 != tokenizerNodes.getLength()
                || 0 != tokenFilterNodes.getLength()) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                    "Configuration Error: Analyzer class='" + analyzerName
                            + "' can not be combined with nested analysis factories");
        }

        try {
            // No need to be core-aware as Analyzers are not in the core-aware list
            final Class<? extends Analyzer> clazz = loader.findClass(analyzerName, Analyzer.class);

            try {
                // first try to use a ctor with version parameter (needed for many new Analyzers that have no default one anymore)
                final Constructor<? extends Analyzer> cnstr = clazz.getConstructor(Version.class);
                final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM);
                // a per-analyzer luceneMatchVersion attribute overrides the schema default
                final Version matchVersion = (matchVersionStr == null) ? luceneMatchVersion
                        : Config.parseLuceneVersionString(matchVersionStr);
                if (matchVersion == null) {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                            "Configuration Error: Analyzer '" + clazz.getName()
                                    + "' needs a 'luceneMatchVersion' parameter");
                }
                return cnstr.newInstance(matchVersion);
            } catch (final NoSuchMethodException nsme) {
                // otherwise use default ctor
                return clazz.newInstance();
            }
        } catch (final Exception e) {
            logger.error("Cannot load analyzer: " + analyzerName, e);
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                    "Cannot load analyzer: " + analyzerName, e);
        }
    }

    // Load the CharFilters
    // --------------------------------------------------------------------------------
    final ArrayList<CharFilterFactory> charFilters = new ArrayList<CharFilterFactory>();
    final AbstractPluginLoader<CharFilterFactory> charFilterLoader = new AbstractPluginLoader<CharFilterFactory>(
            "[analyzerConfig] analyzer/charFilter", CharFilterFactory.class, false, false) {

        @Override
        protected CharFilterFactory create(final SolrResourceLoader loader, final String name,
                final String className, final Node node) throws Exception {
            final Map<String, String> params = DOMUtil.toMap(node.getAttributes());
            // resolve the effective luceneMatchVersion for this factory and pass it along as a parameter
            String configuredVersion = params.remove(LUCENE_MATCH_VERSION_PARAM);
            params.put(LUCENE_MATCH_VERSION_PARAM, parseConfiguredVersion(configuredVersion,
                    CharFilterFactory.class.getSimpleName(), luceneMatchVersion).toString());
            CharFilterFactory factory = loader.newInstance(className, CharFilterFactory.class,
                    getDefaultPackages(), new Class[] { Map.class }, new Object[] { params });
            factory.setExplicitLuceneMatchVersion(null != configuredVersion);
            return factory;
        }

        @Override
        protected void init(final CharFilterFactory plugin, final Node node) throws Exception {
            if (plugin != null) {
                charFilters.add(plugin);
            }
        }

        @Override
        protected CharFilterFactory register(final String name, final CharFilterFactory plugin) {
            return null; // used for map registration
        }

    };

    charFilterLoader.load(loader, charFilterNodes);

    // Load the Tokenizer
    // Although an analyzer only allows a single Tokenizer, we load a list to make sure
    // the configuration is ok
    // --------------------------------------------------------------------------------
    final ArrayList<TokenizerFactory> tokenizers = new ArrayList<TokenizerFactory>(1);
    final AbstractPluginLoader<TokenizerFactory> tokenizerLoader = new AbstractPluginLoader<TokenizerFactory>(
            "[analyzerConfig] analyzer/tokenizer", TokenizerFactory.class, false, false) {

        @Override
        protected TokenizerFactory create(final SolrResourceLoader loader, final String name,
                final String className, final Node node) throws Exception {
            final Map<String, String> params = DOMUtil.toMap(node.getAttributes());
            // resolve the effective luceneMatchVersion for this factory and pass it along as a parameter
            String configuredVersion = params.remove(LUCENE_MATCH_VERSION_PARAM);
            params.put(LUCENE_MATCH_VERSION_PARAM, parseConfiguredVersion(configuredVersion,
                    TokenizerFactory.class.getSimpleName(), luceneMatchVersion).toString());
            TokenizerFactory factory = loader.newInstance(className, TokenizerFactory.class,
                    getDefaultPackages(), new Class[] { Map.class }, new Object[] { params });
            factory.setExplicitLuceneMatchVersion(null != configuredVersion);
            return factory;
        }

        @Override
        protected void init(final TokenizerFactory plugin, final Node node) throws Exception {
            // reject configurations that declare more than one tokenizer
            if (!tokenizers.isEmpty()) {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                        "Multiple tokenizers defined for: " + node);
            }
            tokenizers.add(plugin);
        }

        @Override
        protected TokenizerFactory register(final String name, final TokenizerFactory plugin) {
            return null; // used for map registration
        }
    };

    tokenizerLoader.load(loader, tokenizerNodes);

    // Make sure something was loaded
    if (tokenizers.isEmpty()) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                "analyzer without class or tokenizer & filter list");
    }

    // Load the Filters
    // --------------------------------------------------------------------------------
    final ArrayList<TokenFilterFactory> filters = new ArrayList<TokenFilterFactory>();
    final AbstractPluginLoader<TokenFilterFactory> filterLoader = new AbstractPluginLoader<TokenFilterFactory>(
            "[analyzerConfig] analyzer/filter", TokenFilterFactory.class, false, false) {

        @Override
        protected TokenFilterFactory create(final SolrResourceLoader loader, final String name,
                final String className, final Node node) throws Exception {
            final Map<String, String> params = DOMUtil.toMap(node.getAttributes());
            // resolve the effective luceneMatchVersion for this factory and pass it along as a parameter
            String configuredVersion = params.remove(LUCENE_MATCH_VERSION_PARAM);
            params.put(LUCENE_MATCH_VERSION_PARAM, parseConfiguredVersion(configuredVersion,
                    TokenFilterFactory.class.getSimpleName(), luceneMatchVersion).toString());
            TokenFilterFactory factory = loader.newInstance(className, TokenFilterFactory.class,
                    getDefaultPackages(), new Class[] { Map.class }, new Object[] { params });
            factory.setExplicitLuceneMatchVersion(null != configuredVersion);
            return factory;
        }

        @Override
        protected void init(final TokenFilterFactory plugin, final Node node) throws Exception {
            if (plugin != null) {
                filters.add(plugin);
            }
        }

        @Override
        protected TokenFilterFactory register(final String name, final TokenFilterFactory plugin)
                throws Exception {
            return null; // used for map registration
        }
    };
    filterLoader.load(loader, tokenFilterNodes);

    // assemble the final analysis chain: charFilters -> tokenizer -> filters
    return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]), tokenizers.get(0),
            filters.toArray(new TokenFilterFactory[filters.size()]));
}

From source file:com.sindicetech.siren.solr.schema.ConciseJsonField.java

License:Open Source License

/**
 * Append the mandatory SIREn filters for the concise model, i.e.,
 * {@link com.sindicetech.siren.solr.analysis.DatatypeAnalyzerFilterFactory},
 * {@link com.sindicetech.siren.solr.analysis.PathEncodingFilterFactory},
 * {@link com.sindicetech.siren.solr.analysis.PositionAttributeFilterFactory} and
 * {@link com.sindicetech.siren.solr.analysis.SirenPayloadFilterFactory}, to the tokenizer chain.
 *
 * @see ExtendedJsonField#appendSirenFilters(org.apache.lucene.analysis.Analyzer, java.util.Map)
 */
@Override
protected Analyzer appendSirenFilters(final Analyzer analyzer, final Map<String, Datatype> datatypes) {
    // only a TokenizerChain exposes the factory lists we need to extend
    if (!(analyzer instanceof TokenizerChain)) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                "Invalid index analyzer '" + analyzer.getClass() + "' received");
    }

    final TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    final TokenFilterFactory[] existing = tokenizerChain.getTokenFilterFactories();

    // build an extended filter list: the existing filters followed by the
    // four mandatory SIREn filters, in this exact order
    final TokenFilterFactory[] extended = new TokenFilterFactory[existing.length + 4];
    for (int i = 0; i < existing.length; i++) {
        extended[i] = existing[i];
    }

    // datatype analyzer filter, pre-registered with the configured datatypes
    final DatatypeAnalyzerFilterFactory datatypeAnalyzerFactory = new DatatypeAnalyzerFilterFactory(
            new HashMap<String, String>());
    datatypeAnalyzerFactory.register(datatypes);
    extended[existing.length] = datatypeAnalyzerFactory;
    // path encoding filter
    extended[existing.length + 1] = new PathEncodingFilterFactory(new HashMap<String, String>());
    // position attribute filter
    extended[existing.length + 2] = new PositionAttributeFilterFactory(new HashMap<String, String>());
    // siren payload filter
    extended[existing.length + 3] = new SirenPayloadFilterFactory(new HashMap<String, String>());

    // rebuild the chain with the same char filters and tokenizer, but the
    // extended filter list
    return new TokenizerChain(tokenizerChain.getCharFilterFactories(), tokenizerChain.getTokenizerFactory(),
            extended);
}

From source file:com.sindicetech.siren.solr.schema.ExtendedJsonField.java

License:Open Source License

/**
 * Initialises the field type: validates properties, reads the mandatory
 * datatype-analyzer configuration path, forces the SIREn postings format,
 * and installs the index analyzer (tokenizer chain plus SIREn filters).
 */
@Override
protected void init(final IndexSchema schema, final Map<String, String> args) {
    // first call TextField.init to set omitTermFreqAndPositions to false
    super.init(schema, args);
    this.checkFieldTypeProperties();
    // initialise specific SIREn's properties
    this.datatypeAnalyzerConfigPath = args.remove(DATATYPECONFIG_KEY);

    if (datatypeAnalyzerConfigPath == null) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                "ExtendedJsonField types require a '" + DATATYPECONFIG_KEY + "' parameter: " + this.typeName);
    }

    // set the posting format
    args.put("postingsFormat", Siren10AForPostingsFormat.NAME);

    this.luceneDefaultVersion = schema.getDefaultLuceneMatchVersion();

    // instantiate the index analyzer associated to the field: a bare
    // tokenizer chain (no char filters, no token filters) extended with the
    // mandatory SIREn filters by appendSirenFilters
    Analyzer indexAnalyzer = new TokenizerChain(new CharFilterFactory[0], this.getTokenizerFactory(args),
            new TokenFilterFactory[0]);
    indexAnalyzer = this.appendSirenFilters(indexAnalyzer, this.getDatatypes());
    this.setIndexAnalyzer(indexAnalyzer);

    // NOTE(review): super.init is invoked a second time here, after args has
    // been mutated (DATATYPECONFIG_KEY removed, postingsFormat added) —
    // presumably intentional so the superclass picks up the updated args,
    // but confirm it is not an accidental duplicate of the call above.
    super.init(schema, args);
}

From source file:com.sindicetech.siren.solr.schema.ExtendedJsonField.java

License:Open Source License

/**
 * Append the mandatory SIREn filters, i.e.,
 * {@link DatatypeAnalyzerFilterFactory},
 * {@link PositionAttributeFilterFactory} and
 * {@link SirenPayloadFilterFactory}, to the tokenizer chain.
 * <br/>/*w  w w  . j  a v  a2s . co  m*/
 * The first time this is called, it will create a
 * {@link com.sindicetech.siren.solr.analysis.DatatypeAnalyzerFilterFactory} with no datatype registered. The datatypes
 * will be loaded and registered later, when {@link #inform(org.apache.lucene.analysis.util.ResourceLoader)} is
 * called.
 * <br/>
 * This is necessary to avoid having to call {@link org.apache.solr.schema.IndexSchema#refreshAnalyzers()}.
 * The {@link org.apache.solr.schema.IndexSchema} will have a reference to the SIREn field's analyzer, and
 * to the {@link com.sindicetech.siren.solr.analysis.DatatypeAnalyzerFilterFactory}. When the datatypes will be loaded,
 * we will access this reference, and register the datatypes.
 */
protected Analyzer appendSirenFilters(final Analyzer analyzer, final Map<String, Datatype> datatypes) {
    if (!(analyzer instanceof TokenizerChain)) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                "Invalid index analyzer '" + analyzer.getClass() + "' received");
    }

    final TokenizerChain chain = (TokenizerChain) analyzer;
    // copy the existing list of token filters
    final TokenFilterFactory[] old = chain.getTokenFilterFactories();
    final TokenFilterFactory[] filterFactories = new TokenFilterFactory[old.length + 3];
    System.arraycopy(old, 0, filterFactories, 0, old.length);
    // append the datatype analyzer filter factory
    final DatatypeAnalyzerFilterFactory datatypeFactory = new DatatypeAnalyzerFilterFactory(
            new HashMap<String, String>());
    datatypeFactory.register(datatypes);
    filterFactories[old.length] = datatypeFactory;
    // append the position attribute filter factory
    filterFactories[old.length + 1] = new PositionAttributeFilterFactory(new HashMap<String, String>());
    // append the siren payload filter factory
    filterFactories[old.length + 2] = new SirenPayloadFilterFactory(new HashMap<String, String>());
    // create a new tokenizer chain with the updated list of filter factories
    return new TokenizerChain(chain.getCharFilterFactories(), chain.getTokenizerFactory(), filterFactories);
}

From source file:org.hibernate.search.impl.SolrAnalyzerBuilder.java

License:Open Source License

/**
 * Builds a Lucene <code>Analyzer</code> from the specified <code>AnalyzerDef</code> annotation.
 *
 * @param analyzerDef The <code>AnalyzerDef</code> annotation as found in the annotated domain class.
 * @param luceneMatchVersion The lucene version (required since Lucene 3.x)
 *
 * @return a Lucene <code>Analyzer</code>
 */
public static Analyzer buildAnalyzer(AnalyzerDef analyzerDef, Version luceneMatchVersion) {
    ResourceLoader resourceLoader = new HibernateSearchResourceLoader();

    // instantiate and initialise the tokenizer factory declared by the annotation
    TokenizerDef tokenizerDef = analyzerDef.tokenizer();
    TokenizerFactory tokenizerFactory = instanceFromClass(TokenizerFactory.class, tokenizerDef.factory(),
            "Tokenizer factory");
    final Map<String, String> tokenizerParameters = getMapOfParameters(tokenizerDef.params(),
            luceneMatchVersion);
    tokenizerFactory.init(tokenizerParameters);
    injectResourceLoader(tokenizerFactory, resourceLoader, tokenizerParameters);

    // instantiate and initialise each declared token filter factory, in order
    final TokenFilterDef[] filterDefs = analyzerDef.filters();
    TokenFilterFactory[] filters = new TokenFilterFactory[filterDefs.length];
    for (int index = 0; index < filterDefs.length; index++) {
        TokenFilterDef filterDef = filterDefs[index];
        filters[index] = instanceFromClass(TokenFilterFactory.class, filterDef.factory(),
                "Token filter factory");
        final Map<String, String> filterParameters = getMapOfParameters(filterDef.params(),
                luceneMatchVersion);
        filters[index].init(filterParameters);
        injectResourceLoader(filters[index], resourceLoader, filterParameters);
    }

    // instantiate and initialise each declared char filter factory, in order
    final CharFilterDef[] charFilterDefs = analyzerDef.charFilters();
    CharFilterFactory[] charFilters = new CharFilterFactory[charFilterDefs.length];
    for (int index = 0; index < charFilterDefs.length; index++) {
        CharFilterDef charFilterDef = charFilterDefs[index];
        charFilters[index] = instanceFromClass(CharFilterFactory.class, charFilterDef.factory(),
                "Character filter factory");
        final Map<String, String> charFilterParameters = getMapOfParameters(charFilterDef.params(),
                luceneMatchVersion);
        charFilters[index].init(charFilterParameters);
        injectResourceLoader(charFilters[index], resourceLoader, charFilterParameters);
    }

    return new TokenizerChain(charFilters, tokenizerFactory, filters);
}

From source file:org.sindice.siren.solr.schema.AnalyzerConfigReader.java

License:Open Source License

/**
 * Read an analyzer definition and instantiate an {@link Analyzer} object.
 *
 * <p> Code taken from {@link IndexSchema#readAnalyzer()}
 *
 * @param node An analyzer node from the config file
 * @return An analyzer/*ww  w. j a  v a2 s.c  om*/
 * @throws XPathExpressionException If an XPath expression cannot be evaluated
 */
protected static Analyzer readAnalyzer(final Node node, final SolrResourceLoader loader,
        final Version luceneMatchVersion) throws XPathExpressionException {
    if (node == null)
        return null;
    final NamedNodeMap attrs = node.getAttributes();
    final String analyzerName = DOMUtil.getAttr(attrs, "class");
    if (analyzerName != null) {
        // No need to be core-aware as Analyzers are not in the core-aware list
        final Class<? extends Analyzer> clazz = loader.findClass(analyzerName).asSubclass(Analyzer.class);
        try {
            try {
                // first try to use a ctor with version parameter (needed for many new Analyzers that have no default one anymore)
                final Constructor<? extends Analyzer> cnstr = clazz.getConstructor(Version.class);
                final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM);
                final Version matchVersion = (matchVersionStr == null) ? luceneMatchVersion
                        : Config.parseLuceneVersionString(matchVersionStr);
                if (matchVersion == null) {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                            "Configuration Error: Analyzer '" + clazz.getName()
                                    + "' needs a 'luceneMatchVersion' parameter");
                }
                return cnstr.newInstance(matchVersion);
            } catch (final NoSuchMethodException nsme) {
                // otherwise use default ctor
                return clazz.newInstance();
            }
        } catch (final Exception e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                    "Cannot load analyzer: " + analyzerName);
        }
    }

    final XPath xpath = XPathFactory.newInstance().newXPath();

    // Load the CharFilters
    // --------------------------------------------------------------------------------
    final ArrayList<CharFilterFactory> charFilters = new ArrayList<CharFilterFactory>();
    final AbstractPluginLoader<CharFilterFactory> charFilterLoader = new AbstractPluginLoader<CharFilterFactory>(
            "[analyzerConfig] analyzer/charFilter", false, false) {
        @Override
        protected void init(final CharFilterFactory plugin, final Node node) throws Exception {
            if (plugin != null) {
                final Map<String, String> params = DOMUtil.toMapExcept(node.getAttributes(), "class");
                // copy the luceneMatchVersion from config, if not set
                if (!params.containsKey(LUCENE_MATCH_VERSION_PARAM))
                    params.put(LUCENE_MATCH_VERSION_PARAM, luceneMatchVersion.toString());
                plugin.init(params);
                charFilters.add(plugin);
            }
        }

        @Override
        protected CharFilterFactory register(final String name, final CharFilterFactory plugin)
                throws Exception {
            return null; // used for map registration
        }
    };
    charFilterLoader.load(loader, (NodeList) xpath.evaluate("./charFilter", node, XPathConstants.NODESET));

    // Load the Tokenizer
    // Although an analyzer only allows a single Tokenizer, we load a list to make sure
    // the configuration is ok
    // --------------------------------------------------------------------------------
    final ArrayList<TokenizerFactory> tokenizers = new ArrayList<TokenizerFactory>(1);
    final AbstractPluginLoader<TokenizerFactory> tokenizerLoader = new AbstractPluginLoader<TokenizerFactory>(
            "[analyzerConfig] analyzer/tokenizer", false, false) {
        @Override
        protected void init(final TokenizerFactory plugin, final Node node) throws Exception {
            if (!tokenizers.isEmpty()) {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                        "Multiple tokenizers defined for: " + node);
            }
            final Map<String, String> params = DOMUtil.toMapExcept(node.getAttributes(), "class");
            // copy the luceneMatchVersion from config, if not set
            if (!params.containsKey(LUCENE_MATCH_VERSION_PARAM))
                params.put(LUCENE_MATCH_VERSION_PARAM, luceneMatchVersion.toString());
            plugin.init(params);
            tokenizers.add(plugin);
        }

        @Override
        protected TokenizerFactory register(final String name, final TokenizerFactory plugin) throws Exception {
            return null; // used for map registration
        }
    };
    tokenizerLoader.load(loader, (NodeList) xpath.evaluate("./tokenizer", node, XPathConstants.NODESET));

    // Make sure something was loaded
    if (tokenizers.isEmpty()) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                "analyzer without class or tokenizer & filter list");
    }

    // Load the Filters
    // --------------------------------------------------------------------------------
    final ArrayList<TokenFilterFactory> filters = new ArrayList<TokenFilterFactory>();
    final AbstractPluginLoader<TokenFilterFactory> filterLoader = new AbstractPluginLoader<TokenFilterFactory>(
            "[analyzerConfig] analyzer/filter", false, false) {
        @Override
        protected void init(final TokenFilterFactory plugin, final Node node) throws Exception {
            if (plugin != null) {
                final Map<String, String> params = DOMUtil.toMapExcept(node.getAttributes(), "class");
                // copy the luceneMatchVersion from config, if not set
                if (!params.containsKey(LUCENE_MATCH_VERSION_PARAM))
                    params.put(LUCENE_MATCH_VERSION_PARAM, luceneMatchVersion.toString());
                plugin.init(params);
                filters.add(plugin);
            }
        }

        @Override
        protected TokenFilterFactory register(final String name, final TokenFilterFactory plugin)
                throws Exception {
            return null; // used for map registration
        }
    };
    filterLoader.load(loader, (NodeList) xpath.evaluate("./filter", node, XPathConstants.NODESET));

    return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]), tokenizers.get(0),
            filters.toArray(new TokenFilterFactory[filters.size()]));
}

From source file:solr2155.solr.schema.GeoHashField.java

License:Apache License

/**
 * Initializes the field type: reads the optional {@code length} argument to
 * size the grid reference system, then installs an index analyzer built from
 * a bare edge n-gram tokenizer.
 */
@Override
protected void init(IndexSchema schema, Map<String, String> args) {
    // consume the optional "length" argument; fall back to the default grid length
    final String lengthArg = args.remove("length");
    final int gridLength;
    if (lengthArg != null) {
        gridLength = Integer.parseInt(lengthArg);
    } else {
        gridLength = DEFAULT_LENGTH;
    }
    gridReferenceSystem = new GridNode.GridReferenceSystem(gridLength);

    // index analyzer: front-side edge n-grams of length 1..MAX, with no char
    // filters and no token filters
    analyzer = new TokenizerChain(new CharFilterFactory[0], new BaseTokenizerFactory() {
        public Tokenizer create(Reader input) {
            return new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, Integer.MAX_VALUE);
        }
    }, new TokenFilterFactory[0]);
    // (leave default queryAnalyzer -- single token)

    // properties |= OMIT_NORMS;  // can't do this since properties isn't public/protected
}