Example usage of the org.apache.lucene.analysis.ar.ArabicAnalyzer constructor

List of usage examples for the org.apache.lucene.analysis.ar.ArabicAnalyzer constructor

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.ar.ArabicAnalyzer constructor.

Prototype

public ArabicAnalyzer(CharArraySet stopwords) 

Source Link

Document

Builds an analyzer with the given stop words

Usage

From source file:de.berlinbuzzwords.AnalyzerExampleTest.java

License:Apache License

/**
 * Prints the terms an {@code ArabicAnalyzer} (match version {@code LUCENE_43}) produces for a
 * sample sentence.
 *
 * @throws IOException declared for the term-printing call
 */
@Test
public void testArabicAnalysis() throws IOException {
    // NOTE(review): the sample text only contains '?' placeholders in this copy of the
    // source; presumably the original Arabic characters were lost to an encoding problem.
    final String arabic = "?  ? ?  ? ??  ?  ??  ?.";
    final ArabicAnalyzer arabicAnalyzer = new ArabicAnalyzer(Version.LUCENE_43);
    printer.printTerms(arabicAnalyzer, arabic);
}

From source file:fr.lipn.yasemir.Yasemir.java

License:Open Source License

/**
 * Initialisation method to be called before every action
 * @param configFile/*from   w ww.  j  av a  2 s  . c  o m*/
 */
public static void init(String configFile) {
    System.err.println("Reading config file...");
    ConfigurationHandler.init(configFile);

    //setting paths
    YASEMIR_HOME = ConfigurationHandler.YASEMIR_HOME;
    INDEX_DIR = YASEMIR_HOME + System.getProperty("file.separator") + ConfigurationHandler.INDEXDIR;
    TERM_DIR = YASEMIR_HOME + System.getProperty("file.separator") + ConfigurationHandler.TERMIDXDIR;
    //TERM_DIR=INDEX_DIR+System.getProperty("file.separator")+ConfigurationHandler.TERMIDXDIR;
    COLLECTION_DIR = ConfigurationHandler.CORPUSDIR;
    idField = ConfigurationHandler.DOCIDFIELD;
    ID_ASATTR = ConfigurationHandler.IDFIELD_ASATTR;
    DOC_DELIM = ConfigurationHandler.DOC_DELIM;
    COLLECTION_LANG = ConfigurationHandler.CORPUSLANG;

    if (COLLECTION_LANG.equals("fr"))
        analyzer = new FrenchAnalyzer(Version.LUCENE_44);
    else if (COLLECTION_LANG.equals("it"))
        analyzer = new ItalianAnalyzer(Version.LUCENE_44);
    else if (COLLECTION_LANG.equals("es"))
        analyzer = new SpanishAnalyzer(Version.LUCENE_44);
    else if (COLLECTION_LANG.equals("de"))
        analyzer = new GermanAnalyzer(Version.LUCENE_44);
    else if (COLLECTION_LANG.equals("pt"))
        analyzer = new PortugueseAnalyzer(Version.LUCENE_44);
    else if (COLLECTION_LANG.equals("ca"))
        analyzer = new CatalanAnalyzer(Version.LUCENE_44);
    else if (COLLECTION_LANG.equals("nl"))
        analyzer = new DutchAnalyzer(Version.LUCENE_44);
    else if (COLLECTION_LANG.equals("ar"))
        analyzer = new ArabicAnalyzer(Version.LUCENE_44);
    else
        analyzer = new EnglishAnalyzer(Version.LUCENE_44);

    //setting search mode
    String sm = ConfigurationHandler.SEARCH_MODE;
    if (sm != null) {
        if (sm.equalsIgnoreCase("semantic"))
            MODE = SEMANTIC;
        else if (sm.equalsIgnoreCase("hybrid"))
            MODE = HYBRID;
        else
            MODE = CLASSIC;
    }

    //setting concept similarity measure
    String smm = ConfigurationHandler.SIM_MEASURE;
    if (smm != null) {
        if (smm.equalsIgnoreCase("pg1"))
            SIM_MEASURE = ConceptSimilarity.PROXYGENEA1;
        else if (smm.equalsIgnoreCase("pg2"))
            SIM_MEASURE = ConceptSimilarity.PROXYGENEA2;
        else if (smm.equalsIgnoreCase("pg3"))
            SIM_MEASURE = ConceptSimilarity.PROXYGENEA3;
        else
            SIM_MEASURE = ConceptSimilarity.WU;
    }

    //setting concept weights
    String cw = ConfigurationHandler.CONCEPTWEIGHT;
    if (cw != null) {
        if (cw.equalsIgnoreCase("fixed"))
            CONCEPT_WEIGHTS = ClassWeightHandler.FIXED;
        else if (cw.equalsIgnoreCase("idf"))
            CONCEPT_WEIGHTS = ClassWeightHandler.IDF;
        else if (cw.equalsIgnoreCase("prob"))
            CONCEPT_WEIGHTS = ClassWeightHandler.PROB;
        else if (cw.equalsIgnoreCase("gauss"))
            CONCEPT_WEIGHTS = ClassWeightHandler.GAUSSPROB;
    }

    //setting annotator
    ANNOTATOR = ConfigurationHandler.ANNOTENGINE;
    annotator = new SentenceBasedAnnotator(TERM_DIR);
    //annotator=new KNNAnnotator(TERM_DIR); //TODO: not finished (select annotator depending on configuration file)
    try {
        Class<?> cls = Class.forName(ANNOTATOR);
        Constructor<?> constructor = cls.getConstructor(String.class);
        annotator = (SemanticAnnotator) constructor.newInstance(TERM_DIR);
        //Object instance = constructor.newInstance("stringparam");
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println(
                "[YaSemIR]: failed to load the specified annotator, falling back to IndexBasedAnnotator");
        annotator = annotator = new SentenceBasedAnnotator(TERM_DIR);
    }
    //setting ngrams enabled or not
    CKPD_ENABLED = ConfigurationHandler.NGRAMS_ENABLED;

    //setting semantic fields
    semBalises = new HashSet<String>();
    semBalises.addAll(ConfigurationHandler.getSemanticFields());

    //setting classic fields
    clsBalises = new HashSet<String>();
    clsBalises.addAll(ConfigurationHandler.getClassicFields());

    //setting score type
    SCORE = ConfigurationHandler.SCORE;

    //setting ontologies and terminologies
    System.err.println("[YaSemIR]: Loading Knowledge Battery...");

    HashMap<String, String> ontoSKOSconf = ConfigurationHandler.getOntologySKOSMap();
    HashMap<String, String> ontoRootconf = ConfigurationHandler.getOntologyRootMap();

    for (String ontoLoc : ontoSKOSconf.keySet()) {
        String ontoRoot = ontoRootconf.get(ontoLoc);
        Ontology o = null;
        if (ontoRoot.trim().isEmpty())
            o = new Ontology(ontoLoc);
        else
            o = new Ontology(ontoLoc, ontoRoot);
        System.err.println("[YaSemIR]: loaded ontology: " + o.getBaseAddr() + " at " + ontoLoc);
        String termPath = ontoSKOSconf.get(ontoLoc);
        SKOSTerminology t = null;
        if (!termPath.trim().isEmpty()) {
            System.err.println("[YaSemIR]: loading terminology from " + termPath);
            t = new SKOSTerminology(o.getOntologyID(), termPath);
        } else {
            System.err.println("[YaSemIR]: no terminology provided: generating trivial terminology from "
                    + o.getBaseAddr() + "...");
            t = o.generateTerminology();
        }
        System.err.println("[YaSemIR]: loaded terminology: " + t.getTerminologyID());
        KnowledgeBattery.addOntology(o, t);

    }
    if (INDEXING_MODE)
        KnowledgeBattery.createTermIndex();
    System.err.println("[YaSemIR]: Done.");

}

From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java

License:Open Source License

public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) {
    this.configuration = configuration;

    languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true));
    languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("en", new AnalyzerWrapper(
            new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true));
    languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true));
}

From source file:org.lucenerevolution.AnalyzerExampleTest.java

License:Apache License

/**
 * Prints the terms an {@code ArabicAnalyzer} (match version {@code LUCENE_42}) produces for a
 * sample sentence.
 *
 * @throws IOException declared for the term-printing call
 */
@Test
public void testArabicAnalysis() throws IOException {
    // NOTE(review): the sample text only contains '?' placeholders in this copy of the
    // source; presumably the original Arabic characters were lost to an encoding problem.
    final String arabic = "?  ? ?  ? ??  ?  ??  ?.";
    final ArabicAnalyzer arabicAnalyzer = new ArabicAnalyzer(Version.LUCENE_42);
    printer.printTerms(arabicAnalyzer, arabic);
}

From source file:org.omegat.tokenizer.LuceneArabicTokenizer.java

License:Open Source License

/**
 * Produces the token stream for {@code strOrig}.
 *
 * <p>Without stemming the text is tokenized by a plain {@code StandardTokenizer}. With
 * stemming an {@code ArabicAnalyzer} is used: built from the behavior version alone when
 * {@code stopWordsAllowed} is set, otherwise built with an additional empty {@code String}
 * array (presumably an empty stop-word list; confirm against the Lucene version in use).
 *
 * @param strOrig          text to tokenize
 * @param stemsAllowed     selects the {@code ArabicAnalyzer} path instead of the plain tokenizer
 * @param stopWordsAllowed selects which {@code ArabicAnalyzer} constructor is used
 * @return the token stream over {@code strOrig}
 */
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) {
    // Guard clause: no stemming requested, so no language-specific analysis is needed.
    if (!stemsAllowed) {
        return new StandardTokenizer(getBehavior(), new StringReader(strOrig));
    }

    final ArabicAnalyzer arabicAnalyzer;
    if (stopWordsAllowed) {
        arabicAnalyzer = new ArabicAnalyzer(getBehavior());
    } else {
        arabicAnalyzer = new ArabicAnalyzer(getBehavior(), new String[] {});
    }
    return arabicAnalyzer.tokenStream("", new StringReader(strOrig));
}

From source file:perLucene.Server.java

License:Open Source License

/**
 * Replaces {@code ha} with a fresh map from language key to a newly constructed analyzer.
 * All analyzers are created with {@code Version.LUCENE_41}.
 */
private static void initAnalyzers() {
    // Single Lucene match version shared by every analyzer registered below.
    final Version matchVersion = Version.LUCENE_41;

    ha = new HashMap<String, Analyzer>();

    ha.put("ar", new ArabicAnalyzer(matchVersion));
    ha.put("el", new GreekAnalyzer(matchVersion));
    ha.put("bg", new BulgarianAnalyzer(matchVersion));
    ha.put("br", new BrazilianAnalyzer(matchVersion));
    ha.put("ca", new CatalanAnalyzer(matchVersion));
    ha.put("cz", new CzechAnalyzer(matchVersion));
    ha.put("da", new DanishAnalyzer(matchVersion));
    ha.put("de", new GermanAnalyzer(matchVersion));
    ha.put("en", new EnglishAnalyzer(matchVersion));
    ha.put("es", new SpanishAnalyzer(matchVersion));
    ha.put("eu", new BasqueAnalyzer(matchVersion));
    ha.put("fa", new PersianAnalyzer(matchVersion));
    ha.put("fi", new FinnishAnalyzer(matchVersion));
    ha.put("fr", new FrenchAnalyzer(matchVersion));
    ha.put("ga", new IrishAnalyzer(matchVersion));
    ha.put("gl", new GalicianAnalyzer(matchVersion));
    ha.put("hi", new HindiAnalyzer(matchVersion));
    ha.put("hu", new HungarianAnalyzer(matchVersion));
    ha.put("hy", new ArmenianAnalyzer(matchVersion));
    ha.put("id", new IndonesianAnalyzer(matchVersion));
    ha.put("it", new ItalianAnalyzer(matchVersion));
    ha.put("lv", new LatvianAnalyzer(matchVersion));
    ha.put("nl", new DutchAnalyzer(matchVersion));
    ha.put("no", new NorwegianAnalyzer(matchVersion));
    ha.put("pt", new PortugueseAnalyzer(matchVersion));
    ha.put("ro", new RomanianAnalyzer(matchVersion));
    ha.put("ru", new RussianAnalyzer(matchVersion));
    ha.put("sv", new SwedishAnalyzer(matchVersion));
    ha.put("th", new ThaiAnalyzer(matchVersion));
    ha.put("tr", new TurkishAnalyzer(matchVersion));
    ha.put("cn", new SmartChineseAnalyzer(matchVersion));
}