Example usage for org.apache.lucene.analysis.charfilter HTMLStripCharFilter HTMLStripCharFilter

List of usage examples for org.apache.lucene.analysis.charfilter HTMLStripCharFilter HTMLStripCharFilter

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.charfilter HTMLStripCharFilter constructor HTMLStripCharFilter.

Prototype

public HTMLStripCharFilter(java.io.Reader in) 

Source Link

Document

Creates a new scanner

Usage

From source file:com.codeReading.core.opengrok.search.Results.java

License:Open Source License

/**
 * Reads up to 8192 characters of HTML-stripped text from the reader that
 * {@code getXrefReader(basedir, path, compressed)} returns.
 *
 * @param basedir    base directory passed through to {@code getXrefReader}
 * @param path       path passed through to {@code getXrefReader}
 * @param compressed passed through to {@code getXrefReader}; also controls whether
 *                   {@code ".gz"} is appended to the name in the warning message
 * @return the stripped text (possibly empty), or {@code ""} if reading failed
 */
private static String getTags(File basedir, String path, boolean compressed) {
    char[] content = new char[1024 * 8];
    try (HTMLStripCharFilter r = new HTMLStripCharFilter(getXrefReader(basedir, path, compressed))) {
        int filled = 0;
        // read() returns -1 at end of stream and may return fewer characters than
        // requested, so keep reading until the buffer is full or the stream ends.
        // Passing -1 straight to the String constructor would throw and be logged
        // as a read error even for a perfectly valid empty stream.
        while (filled < content.length) {
            int len = r.read(content, filled, content.length - filled);
            if (len < 0) {
                break;
            }
            filled += len;
        }
        return new String(content, 0, filled);
    } catch (Exception e) {
        OpenGrokLogger.getLogger().log(Level.WARNING,
                "An error reading tags from " + basedir + path + (compressed ? ".gz" : ""), e);
    }
    return "";
}

From source file:com.sismics.reader.core.dao.lucene.ReaderStandardAnalyzer.java

License:Apache License

/**
 * Adds HTML stripping on top of the default reader chain for the {@code "title"} and
 * {@code "description"} fields; every other field gets the default chain unchanged.
 *
 * @param fieldName name of the field being analyzed
 * @param reader    raw input for the field
 * @return the reader to hand to the tokenizer
 */
@Override
protected Reader initReader(String fieldName, Reader reader) {
    // Decide first, so the field-name check still happens before the super call.
    final boolean stripHtml = fieldName.equals("title") || fieldName.equals("description");
    final Reader base = super.initReader(fieldName, reader);
    return stripHtml ? new HTMLStripCharFilter(base) : base;
}

From source file:it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer.java

License:Apache License

/**
 * Builds the analysis chain: HTML stripping, punctuation removal, standard tokenization,
 * {@code StandardFilter}, lower-casing, ASCII folding and, when {@code shingles} is set,
 * a {@code ShingleFilter} constructed with 5.
 *
 * @param fieldName name of the field being analyzed (not used to vary the chain)
 * @param reader    raw input for the field
 * @return the tokenizer plus the filtered token stream built on top of it
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // Strip markup first: the punctuation filter below deletes '<', '>', '&' and ';',
    // after which HTMLStripCharFilter could no longer recognise tags or entities.
    CharFilter cf = new HTMLStripCharFilter(reader);

    // '-', '[' and ']' are escaped so they are literals. Unescaped, "*-!" is a reversed
    // range and "[]" opens a nested class, which makes Pattern.compile throw.
    cf = new PatternReplaceCharFilter(Pattern.compile("[*\\-!`{}~\\[\\]='<>:/;.&%|+_]"), "", cf);

    // NOTE(review): these char filters are attached only where the components are built.
    // If the Analyzer reuses components and calls setReader with a fresh reader, confirm
    // that reader still passes through this chain (initReader is the usual hook) — TODO confirm.
    final StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_41, cf);
    TokenStream tok = new StandardFilter(Version.LUCENE_41, tokenizer);
    tok = new LowerCaseFilter(Version.LUCENE_41, tok);
    tok = new ASCIIFoldingFilter(tok);
    if (shingles) {
        tok = new ShingleFilter(tok, 5);
    }
    return new TokenStreamComponents(tokenizer, tok);
}

From source file:it.cnr.isti.hpc.dexter.analysis.SpotAnalyzer.java

License:Apache License

/**
 * Normalizes the field's text before tokenization: strips HTML, then applies a sequence of
 * regex clean-ups (leading articles, punctuation, trailing parenthesised text, '#'
 * fragments, sr/jr suffixes, single-letter initials).
 *
 * <p>Previously the final {@code HTMLStripCharFilter} wrapped the raw {@code reader}, so
 * none of the regex filters took effect. They are now chained, which changes the
 * analyzed output; indexes built with the old behavior may need rebuilding.
 *
 * @param fieldName name of the field being analyzed (not used to vary the chain)
 * @param reader    raw input for the field
 * @return the filtered reader to hand to the tokenizer
 */
@Override
protected Reader initReader(String fieldName, Reader reader) {
    // HTML stripping goes first: a later filter turns '<' and '>' into spaces, after
    // which tags could no longer be recognised.
    CharFilter cf = new HTMLStripCharFilter(reader);

    // Drop a leading "the ".
    cf = new PatternReplaceCharFilter(Pattern.compile("^[ ]*the +(.*)"), "$1", cf);

    // Replace listed punctuation with a space.
    cf = new PatternReplaceCharFilter(Pattern.compile("[*!`{}~='<>:/%|=+_]"), " ", cf);

    // Drop a leading "a ", then a trailing " (...)" group.
    cf = new PatternReplaceCharFilter(Pattern.compile("^[ ]*a +(.*)"), "$1", cf);
    cf = new PatternReplaceCharFilter(Pattern.compile("^(.*) \\(.*\\)$"), "$1", cf);

    // Drop everything from the last '#' onwards.
    cf = new PatternReplaceCharFilter(Pattern.compile("^(.*)#.*$"), "$1", cf);

    // Drop "sr"/"jr" (any case, optional dot) with preceding commas/spaces. The word
    // boundaries keep it from eating "sr"/"jr" inside longer words such as "Israel".
    cf = new PatternReplaceCharFilter(Pattern.compile("[, ]*\\b[sjSJ][rR]\\b[.]?"), "", cf);

    // Collapse single-letter initials ("J. R. ") in the middle, at the start and at the end.
    cf = new PatternReplaceCharFilter(Pattern.compile(" ([A-Z][.] ?)+"), " ", cf);
    cf = new PatternReplaceCharFilter(Pattern.compile("^([A-Z][.] ?)+ "), " ", cf);
    cf = new PatternReplaceCharFilter(Pattern.compile(" [A-Z][.]$"), " ", cf);

    return cf;
}

From source file:mllab_lucene.StandardAnalyzerHtml.java

License:Apache License

/**
 * Wraps the input of every field in an {@code HTMLStripCharFilter}.
 *
 * @param fieldName name of the field being analyzed (ignored)
 * @param reader    raw input for the field
 * @return {@code reader} wrapped in an HTML-stripping filter
 */
@Override
protected Reader initReader(String fieldName, Reader reader) {
    final Reader htmlStripped = new HTMLStripCharFilter(reader);
    return htmlStripped;
}

From source file:org.apache.solr.handler.dataimport.HTMLStripTransformer.java

License:Apache License

/**
 * Returns {@code value} with HTML markup removed by {@code HTMLStripCharFilter}.
 *
 * @param value  text to strip
 * @param column column name, used only in the error message
 * @return the stripped text as a {@code String}
 * @throws DataImportHandlerException with severity {@code SEVERE} if reading or closing
 *         the filter fails
 */
private Object stripHTML(String value, String column) {
    StringBuilder out = new StringBuilder();
    // StringReader always supports mark(), so the former BufferedReader fallback was
    // never taken. try-with-resources closes the filter even when read() throws.
    try (HTMLStripCharFilter html = new HTMLStripCharFilter(new StringReader(value))) {
        char[] cbuf = new char[1024 * 10];
        int count;
        while ((count = html.read(cbuf)) != -1) { // -1 marks end of stream
            if (count > 0) {
                out.append(cbuf, 0, count);
            }
        }
    } catch (IOException e) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
                "Failed stripping HTML for column: " + column, e);
    }
    return out.toString();
}

From source file:org.apache.solr.update.processor.HTMLStripFieldUpdateProcessorFactory.java

License:Apache License

/**
 * Creates a processor that replaces each selected {@code CharSequence} field value with
 * its HTML-stripped text; values of any other type are passed through untouched.
 *
 * @param req  current request (not used here)
 * @param rsp  current response (not used here)
 * @param next next processor in the chain
 * @return the mutating processor
 */
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
        UpdateRequestProcessor next) {
    return new FieldValueMutatingUpdateProcessor(getSelector(), next) {
        @Override
        protected Object mutateValue(final Object src) {
            if (!(src instanceof CharSequence)) {
                return src;
            }
            final CharSequence text = (CharSequence) src;
            final StringWriter stripped = new StringWriter(text.length());
            Reader htmlFilter = null;
            try {
                htmlFilter = new HTMLStripCharFilter(new StringReader(text.toString()));
                IOUtils.copy(htmlFilter, stripped);
                return stripped.toString();
            } catch (IOException e) {
                // Stripping failed: fall back to the original value.
                return text;
            } finally {
                IOUtils.closeQuietly(htmlFilter);
            }
        }
    };
}

From source file:org.craftercms.search.service.impl.HtmlStrippingConverter.java

License:Open Source License

/**
 * Returns {@code value} with HTML markup removed by {@code HTMLStripCharFilter}.
 *
 * @param name  field name, used only in the error message
 * @param value text to strip
 * @return the stripped text as a {@code String}
 * @throws SolrDocumentBuildException if reading or closing the filter fails
 */
@Override
public Object convert(String name, String value) {
    StringBuilder strippedValue = new StringBuilder();

    // try-with-resources so the filter (and the reader it wraps) is always closed.
    try (HTMLStripCharFilter htmlStripper = new HTMLStripCharFilter(new StringReader(value))) {
        char[] buffer = new char[BUFFER_SIZE];
        int charsRead;
        while ((charsRead = htmlStripper.read(buffer)) >= 0) {
            if (charsRead > 0) {
                strippedValue.append(buffer, 0, charsRead);
            }
        }
    } catch (IOException e) {
        throw new SolrDocumentBuildException("Error while performing HTML stripping for field '" + name + "'",
                e);
    }

    return strippedValue.toString();
}

From source file:org.craftercms.search.service.impl.SolrDocumentBuilder.java

License:Open Source License

/**
 * Returns {@code value} with HTML markup removed by {@code HTMLStripCharFilter}.
 *
 * @param element field name, used only in the error message
 * @param value   text to strip
 * @return the stripped text
 * @throws SolrDocumentBuildException if reading or closing the filter fails
 */
protected String stripHtml(String element, String value) throws SolrDocumentBuildException {
    StringBuilder strippedValue = new StringBuilder();

    // try-with-resources so the filter (and the reader it wraps) is always closed.
    try (HTMLStripCharFilter htmlStripper = new HTMLStripCharFilter(new StringReader(value))) {
        char[] buffer = new char[1024 * 10];
        int charsRead;
        while ((charsRead = htmlStripper.read(buffer)) >= 0) {
            if (charsRead > 0) {
                strippedValue.append(buffer, 0, charsRead);
            }
        }
    } catch (IOException e) {
        throw new SolrDocumentBuildException("Failed to strip the HTML from field '" + element + "'", e);
    }

    return strippedValue.toString();
}

From source file:org.emonocot.persistence.dao.hibernate.SearchableDaoImpl.java

License:Open Source License

/**
 * Returns {@code value} with HTML markup removed by {@code HTMLStripCharFilter}.
 *
 * @param value text to strip
 * @return the stripped text
 * @throws RuntimeException wrapping the {@code IOException} if reading or closing the
 *         filter fails
 */
private String filter(String value) {
    StringBuilder out = new StringBuilder();
    // try-with-resources closes the filter (and the readers it wraps) even when read() throws.
    try (HTMLStripCharFilter html = new HTMLStripCharFilter(new BufferedReader(new StringReader(value)))) {
        char[] cbuf = new char[1024 * 10];
        int count;
        while ((count = html.read(cbuf)) != -1) { // -1 marks end of stream
            if (count > 0) {
                out.append(cbuf, 0, count);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Failed stripping HTML for value: " + value, e);
    }
    return out.toString();
}