List of usage examples for the HTMLStripCharFilter constructor of class org.apache.lucene.analysis.charfilter.HTMLStripCharFilter
public HTMLStripCharFilter(java.io.Reader in)
From source file:com.codeReading.core.opengrok.search.Results.java
License:Open Source License
/**
 * Returns the beginning of the xref content for {@code path} with HTML markup stripped.
 *
 * <p>The text comes from a single {@code read} into an 8K-character buffer, so at most
 * 8192 characters are returned.
 *
 * @param basedir    base directory handed to {@code getXrefReader}
 * @param path       path handed to {@code getXrefReader}
 * @param compressed handed to {@code getXrefReader}; when true, ".gz" is appended to the
 *                   path shown in the warning message
 * @return the stripped text, or an empty string when nothing could be read
 */
private static String getTags(File basedir, String path, boolean compressed) {
    char[] content = new char[1024 * 8];
    try (HTMLStripCharFilter r = new HTMLStripCharFilter(getXrefReader(basedir, path, compressed))) {
        int len = r.read(content);
        // read() signals end-of-stream with -1. An empty xref is not a failure, so avoid
        // new String(content, 0, -1), which would throw and be logged as a read error.
        if (len > 0) {
            return new String(content, 0, len);
        }
    } catch (Exception e) {
        OpenGrokLogger.getLogger().log(Level.WARNING, "An error reading tags from " + basedir + path + (compressed ? ".gz" : ""), e);
    }
    return "";
}
From source file:com.sismics.reader.core.dao.lucene.ReaderStandardAnalyzer.java
License:Apache License
@Override protected Reader initReader(String fieldName, Reader reader) { if (fieldName.equals("title") || fieldName.equals("description")) { return new HTMLStripCharFilter(super.initReader(fieldName, reader)); }//from w ww. ja va 2 s .com return super.initReader(fieldName, reader); }
From source file:it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { CharFilter cf = new PatternReplaceCharFilter(Pattern.compile("[*-!`{}~[]='<>:/;.&%|=+_]"), "", reader); cf = new HTMLStripCharFilter(cf); final StandardTokenizer analyzer = new StandardTokenizer(Version.LUCENE_41, cf); TokenStream tok = new StandardFilter(Version.LUCENE_41, analyzer); tok = new LowerCaseFilter(Version.LUCENE_41, tok); tok = new ASCIIFoldingFilter(tok); if (shingles) { tok = new ShingleFilter(tok, 5); }//from ww w . j a v a2s . c o m return new TokenStreamComponents(analyzer, tok) { @Override protected void setReader(final Reader reader) throws IOException { super.setReader(reader); } }; }
From source file:it.cnr.isti.hpc.dexter.analysis.SpotAnalyzer.java
License:Apache License
@Override protected Reader initReader(String fieldName, Reader reader) { CharFilter cf = new PatternReplaceCharFilter(Pattern.compile("^[ ]*the +(.*)"), "$1", reader); cf = new PatternReplaceCharFilter(Pattern.compile("[*!`{}~='<>:/%|=+_]"), " ", cf); cf = new PatternReplaceCharFilter(Pattern.compile("^[ ]*a +(.*)"), "$1", cf); cf = new PatternReplaceCharFilter(Pattern.compile("^(.*) \\(.*\\)$"), "$1", cf); cf = new PatternReplaceCharFilter(Pattern.compile("^(.*)#.*$"), "$1", cf); cf = new PatternReplaceCharFilter(Pattern.compile("[, ]*[sjSJ][rR][.]?"), "", cf); cf = new PatternReplaceCharFilter(Pattern.compile(" ([A-Z][.] ?)+"), " ", cf); cf = new PatternReplaceCharFilter(Pattern.compile("^([A-Z][.] ?)+ "), " ", cf); cf = new PatternReplaceCharFilter(Pattern.compile(" [A-Z][.]$"), " ", cf); cf = new HTMLStripCharFilter(reader); return cf;//from www. j av a2 s .c o m }
From source file:mllab_lucene.StandardAnalyzerHtml.java
License:Apache License
@Override //INSERTED protected Reader initReader(String fieldName, Reader reader) { return new HTMLStripCharFilter(reader); }
From source file:org.apache.solr.handler.dataimport.HTMLStripTransformer.java
License:Apache License
private Object stripHTML(String value, String column) { StringBuilder out = new StringBuilder(); StringReader strReader = new StringReader(value); try {// w w w. j a v a2 s .c o m HTMLStripCharFilter html = new HTMLStripCharFilter( strReader.markSupported() ? strReader : new BufferedReader(strReader)); char[] cbuf = new char[1024 * 10]; while (true) { int count = html.read(cbuf); if (count == -1) break; // end of stream mark is -1 if (count > 0) out.append(cbuf, 0, count); } html.close(); } catch (IOException e) { throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Failed stripping HTML for column: " + column, e); } return out.toString(); }
From source file:org.apache.solr.update.processor.HTMLStripFieldUpdateProcessorFactory.java
License:Apache License
@Override public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {// w w w. j ava2 s .c om return new FieldValueMutatingUpdateProcessor(getSelector(), next) { @Override protected Object mutateValue(final Object src) { if (src instanceof CharSequence) { CharSequence s = (CharSequence) src; StringWriter result = new StringWriter(s.length()); Reader in = null; try { in = new HTMLStripCharFilter(new StringReader(s.toString())); IOUtils.copy(in, result); return result.toString(); } catch (IOException e) { // we tried and failed return s; } finally { IOUtils.closeQuietly(in); } } return src; } }; }
From source file:org.craftercms.search.service.impl.HtmlStrippingConverter.java
License:Open Source License
@Override public Object convert(String name, String value) { StringReader reader = new StringReader(value); HTMLStripCharFilter htmlStripper = new HTMLStripCharFilter(reader); char[] buffer = new char[BUFFER_SIZE]; StringBuilder strippedValue = new StringBuilder(); try {//from w ww . j av a 2s. co m int charsRead; do { charsRead = htmlStripper.read(buffer); if (charsRead > 0) { strippedValue.append(buffer, 0, charsRead); } } while (charsRead >= 0); } catch (IOException e) { throw new SolrDocumentBuildException("Error while performing HTML stripping for field '" + name + "'", e); } return strippedValue.toString(); }
From source file:org.craftercms.search.service.impl.SolrDocumentBuilder.java
License:Open Source License
protected String stripHtml(String element, String value) throws SolrDocumentBuildException { StringReader reader = new StringReader(value); //HTMLStripCharFilter htmlStripper = new HTMLStripCharFilter(CharReader.get(reader)); HTMLStripCharFilter htmlStripper = new HTMLStripCharFilter(reader); char[] buffer = new char[1024 * 10]; StringBuilder strippedValue = new StringBuilder(); try {/*from w ww . j a va 2 s.c om*/ int charsRead; do { charsRead = htmlStripper.read(buffer); if (charsRead > 0) { strippedValue.append(buffer, 0, charsRead); } } while (charsRead >= 0); } catch (IOException e) { throw new SolrDocumentBuildException("Failed to strip the HTML from field '" + element + "'", e); } return strippedValue.toString(); }
From source file:org.emonocot.persistence.dao.hibernate.SearchableDaoImpl.java
License:Open Source License
private String filter(String value) { StringBuilder out = new StringBuilder(); StringReader strReader = new StringReader(value); try {//w w w .jav a 2 s . co m HTMLStripCharFilter html = new HTMLStripCharFilter(new BufferedReader(strReader)); char[] cbuf = new char[1024 * 10]; while (true) { int count = html.read(cbuf); if (count == -1) break; // end of stream mark is -1 if (count > 0) out.append(cbuf, 0, count); } html.close(); } catch (IOException e) { throw new RuntimeException("Failed stripping HTML for value: " + value, e); } return out.toString(); }