// Java tutorial
/*
 * Copyright (c) 2012, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.fisheye;

import com.cloudera.recordbreaker.analyzer.FSAnalyzer;
import com.cloudera.recordbreaker.analyzer.SchemaUtils;
import com.cloudera.recordbreaker.analyzer.FileSummary;
import com.cloudera.recordbreaker.analyzer.DataDescriptor;
import com.cloudera.recordbreaker.analyzer.FileSummaryData;
import com.cloudera.recordbreaker.analyzer.SchemaDescriptor;

import org.apache.wicket.model.Model;
import org.apache.wicket.AttributeModifier;
import org.apache.wicket.markup.html.basic.Label;
import org.apache.wicket.markup.html.list.ListView;
import org.apache.wicket.markup.html.list.ListItem;
import org.apache.wicket.markup.repeater.RepeatingView;
import org.apache.wicket.markup.html.WebMarkupContainer;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericRecord;

import java.util.List;
import java.util.Arrays;
import java.util.TreeMap;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Collections;
import java.io.IOException;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

/****************************************************
 * <code>FileContentsTable</code> is a panel that shows the (structured)
 * contents of a FishEye file.
 *
 * The same rows are rendered three times ("rawtables", "datatables" and
 * "schematables"); the page markup lets the user toggle among them.
 *
 * @author "Michael Cafarella" <mjc@cloudera.com>
 ****************************************************/
public class FileContentsTable extends WebMarkupContainer {
  /** Id of the file being displayed; negative means "no file". */
  long fid = -1L;

  /** Maximum number of tuples rendered per file. */
  static final int MAX_ROWS = 100;

  /**
   * Creates an empty, invisible placeholder panel that is not bound to any file.
   */
  public FileContentsTable() {
    super("filecontentstable");
    setOutputMarkupPlaceholderTag(true);
    setVisibilityAllowed(false);
  }

  /**
   * One cell of a header row: a label plus the number of columns it spans.
   * Declared static so that serializing it does not drag the enclosing
   * component along.
   */
  static class HeaderPair implements Serializable {
    String s;
    int count;
    boolean isBottom;

    public HeaderPair(String s, int count) {
      this.s = s;
      this.count = count;
      this.isBottom = false;
    }

    /** Widens this cell by one column. */
    void bumpCount() {
      count += 1;
    }

    String getString() {
      return s;
    }

    int getColSpan() {
      return count;
    }

    /** @return true when this cell belongs to the last (innermost) header row */
    boolean isBottom() {
      return isBottom;
    }

    void setBottom(boolean isBottom) {
      this.isBottom = isBottom;
    }
  }

  /**
   * One cell of a data row: the flattened field name, the value rendered as
   * text, and the path of the file the value came from.
   */
  static class DataField implements Serializable {
    String fieldName;
    boolean isStringVal;
    String dataStr;
    String filename;

    public DataField(String fieldName, Object dataObj, String filename) {
      this.fieldName = fieldName;
      // Any numeric value (including Long and Short, not only Integer, Double
      // and Float) is emitted unquoted in the suggested SQL.
      this.isStringVal = !(dataObj instanceof Number);
      this.dataStr = "" + dataObj;
      this.filename = filename;
    }

    public String getDataFieldName() {
      return fieldName;
    }

    /** @return true when the value must be quoted in a SQL predicate */
    public boolean isStringVal() {
      return isStringVal;
    }

    public String getDataStr() {
      return dataStr;
    }

    public String getFilename() {
      return filename;
    }
  }

  /**
   * A table ready for display: its header rows and its data rows.
   */
  static class DataTablePair implements Serializable {
    List<List<HeaderPair>> headerPairs;
    List<List<DataField>> outputTupleList;

    public DataTablePair(List<List<HeaderPair>> headerPairs, List<List<DataField>> outputTupleList) {
      this.headerPairs = headerPairs;
      this.outputTupleList = outputTupleList;
    }

    List<List<HeaderPair>> getHeaderPairs() {
      return headerPairs;
    }

    List<List<DataField>> getTupleList() {
      return outputTupleList;
    }
  }

  /**
   * Percent-encodes a value for use inside a URL query string.
   */
  private static String urlEncode(String text) {
    try {
      return URLEncoder.encode(text, "UTF-8");
    } catch (UnsupportedEncodingException uee) {
      // UTF-8 support is mandatory for every JVM, so this cannot happen in practice.
      throw new IllegalStateException("UTF-8 is not supported by this JVM", uee);
    }
  }

  /**
   * Escapes the characters that are significant in HTML text and attributes.
   */
  private static String escapeHtml(String text) {
    StringBuilder out = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&':
          out.append("&amp;");
          break;
        case '<':
          out.append("&lt;");
          break;
        case '>':
          out.append("&gt;");
          break;
        case '"':
          out.append("&quot;");
          break;
        case '\'':
          out.append("&#39;");
          break;
        default:
          out.append(c);
          break;
      }
    }
    return out.toString();
  }

  /**
   * Adds a list of tables to this panel under the given Wicket id.
   *
   * @param label       Wicket id of the repeating table container
   * @param tablePairs  tables (headers plus rows) to render
   * @param renderLinks when true, every non-empty cell gets a popover that
   *                    links to a suggested query on the cell's value
   */
  void renderToPage(String label, List<DataTablePair> tablePairs, final boolean renderLinks) {
    final long localFid = this.fid;
    add(new ListView<DataTablePair>(label, tablePairs) {
      protected void populateItem(ListItem<DataTablePair> outerItem) {
        DataTablePair outerModelObj = outerItem.getModelObject();
        List<List<HeaderPair>> outputHeaderList = outerModelObj.getHeaderPairs();
        List<List<DataField>> outputTupleList = outerModelObj.getTupleList();

        // Header rows
        outerItem.add(new ListView<List<HeaderPair>>("attributelabels", outputHeaderList) {
          protected void populateItem(ListItem<List<HeaderPair>> item) {
            List<HeaderPair> myListOfFieldLabels = item.getModelObject();
            ListView<HeaderPair> listOfFields = new ListView<HeaderPair>("fieldlist", myListOfFieldLabels) {
              protected void populateItem(ListItem<HeaderPair> item2) {
                HeaderPair displayPair = item2.getModelObject();
                item2.add(new Label("alabel", "" + displayPair.getString()));
                item2.add(new AttributeModifier("colspan", true, new Model<String>("" + displayPair.getColSpan())));
                // Only grouping headers (everything above the bottom row) are centered.
                if (!displayPair.isBottom()) {
                  item2.add(new AttributeModifier("style", true, new Model<String>("text-align:center")));
                }
              }
            };
            item.add(listOfFields);
          }
        });

        // Data rows
        outerItem.add(new ListView<List<DataField>>("schemalistview", outputTupleList) {
          protected void populateItem(ListItem<List<DataField>> item) {
            List<DataField> myListOfSchemaElts = item.getModelObject();
            ListView<DataField> listofTupleFields = new ListView<DataField>("tupleview", myListOfSchemaElts) {
              protected void populateItem(ListItem<DataField> item2) {
                DataField dataField = item2.getModelObject();
                //
                // Build list of suggested queries for the HTML popover.
                //
                // 1. SELECT * FROM DATA WHERE ATTR = 'celltext'
                // <others coming>
                //
                WebMarkupContainer popovercontent = new WebMarkupContainer("popovercontent");
                Label celltextalone = new Label("celltextalone", "" + dataField.getDataStr());
                item2.add(popovercontent);
                item2.add(celltextalone);

                if (renderLinks && dataField.getDataStr().length() > 0) {
                  String fieldName = dataField.getDataFieldName();
                  String dataStr = dataField.getDataStr();
                  boolean quoted = dataField.isStringVal();

                  // The popover content is interpreted as HTML and the values come
                  // straight from the file, so they are escaped before being embedded.
                  String sqlQueryText = "SELECT * FROM <i>DATA</i> WHERE " + escapeHtml(fieldName) + " = "
                      + (quoted ? "'" : "") + escapeHtml(dataStr) + (quoted ? "'" : "");

                  // Values are percent-encoded so that characters such as '&', '#'
                  // or spaces cannot break the query string.
                  // NOTE(review): a quote inside dataStr still ends the SQL literal once
                  // the receiving page decodes it; confirm how QueryResults handles that.
                  String selectionClause = urlEncode(fieldName) + "+%3D+" + (quoted ? "%27" : "")
                      + urlEncode(dataStr) + (quoted ? "%27" : "");
                  String sqlHyperlink = "/QueryResults?fid=" + localFid + "&projectionclause=*"
                      + "&selectionclause=" + selectionClause + "&filename=" + urlEncode(dataField.getFilename());
                  String totalHTML = "<ul><li><a href='" + sqlHyperlink + "'>" + sqlQueryText + "</a></ul>";

                  popovercontent.add(new AttributeModifier("data-content", true, new Model<String>(totalHTML)));
                  popovercontent.add(new Label("celltext", "" + dataStr));
                  popovercontent.setVisibilityAllowed(true);
                  celltextalone.setVisibilityAllowed(false);
                } else {
                  popovercontent.setVisibilityAllowed(false);
                  celltextalone.setVisibilityAllowed(true);
                }
              }
            };
            item.add(listofTupleFields);
          }
        });
      }
    });
  }

  /**
   * Placeholder; currently does nothing.
   */
  void getSchemaFromData(GenericRecord gr) {
  }

  /**
   * Builds the panel for the file with the given id. The file's records are
   * scanned (at most {@link #MAX_ROWS} are displayed) and rendered in raw,
   * data-order and schema-order form.
   *
   * @param fid id of the file to display
   */
  public FileContentsTable(long fid) {
    super("filecontentstable");
    this.fid = fid;
    FishEye fe = FishEye.getInstance();
    FSAnalyzer fsa = fe.getAnalyzer();
    FileSummaryData fsd = fsa.getFileSummaryData(fid);
    String path = fsd.path + fsd.fname;
    DataDescriptor dd = fsd.getDataDescriptor();
    final boolean querySupported = dd.isHiveSupported() && fe.isQueryServerAvailable(false);
    List<SchemaDescriptor> sds = dd.getSchemaDescriptor();

    /** Schema index plus the number of displayed tuples that matched it. */
    class SchemaPair implements Comparable<SchemaPair> {
      int schemaId;
      int count;

      public SchemaPair(int schemaId, int count) {
        this.schemaId = schemaId;
        this.count = count;
      }

      /** Orders by count, then by schema id; avoids overflow-prone subtraction. */
      public int compareTo(SchemaPair sp) {
        if (count != sp.count) {
          return count < sp.count ? -1 : 1;
        }
        if (schemaId != sp.schemaId) {
          return schemaId < sp.schemaId ? -1 : 1;
        }
        return 0;
      }
    }

    if (sds.size() > 0) {
      SchemaDescriptor sd = sds.get(0);
      Schema schema = sd.getSchema();

      //
      // Step 1. Figure out the hierarchical labels from the Schema.
      // These are the fields we'll grab from each tuple.
      //
      // Doing so entails "unrolling" the schemas that contain unions.
      // That is, translating such schemas into a set of union-free schemas.
      //
      List<List<List<DataField>>> perSchemaTupleLists = new ArrayList<List<List<DataField>>>();
      List<List<List<DataField>>> dataOrderTupleLists = new ArrayList<List<List<DataField>>>();
      List<Integer> schemaOrder = new ArrayList<Integer>();
      List<SchemaPair> schemaFrequency = new ArrayList<SchemaPair>();

      int numRows = 0;
      // Keyed by the schema's string form so that identical schemas collapse.
      TreeMap<String, Schema> uniqueUnrolledSchemas = new TreeMap<String, Schema>();
      for (Iterator it = sd.getIterator(); it.hasNext();) {
        GenericData.Record gr = (GenericData.Record) it.next();
        List<Schema> grSchemas = SchemaUtils.unrollUnionsWithData(schema, gr, false);
        if (grSchemas != null) {
          for (Schema grs : grSchemas) {
            String schemaKey = grs.toString();
            if (uniqueUnrolledSchemas.get(schemaKey) == null) {
              uniqueUnrolledSchemas.put(schemaKey, grs);
            }
          }
        }
        // The limit is tested after the record is processed, so this pass
        // examines up to MAX_ROWS + 1 records.
        if (numRows >= MAX_ROWS) {
          break;
        }
        numRows++;
      }

      List<Schema> allSchemas = new ArrayList<Schema>(uniqueUnrolledSchemas.values());
      List<List<String>> schemaLabelLists = new ArrayList<List<String>>();
      for (int i = 0; i < allSchemas.size(); i++) {
        Schema s1 = allSchemas.get(i);
        schemaLabelLists.add(SchemaUtils.flattenNames(s1));
        perSchemaTupleLists.add(new ArrayList<List<DataField>>());
        schemaFrequency.add(new SchemaPair(i, 0));
      }

      //
      // Step 2. Build the set of rows for display. One row per tuple.
      //
      numRows = 0;
      boolean incompleteFileScan = false;
      int lastBestIdx = -1;
      for (Iterator it = sd.getIterator(); it.hasNext();) {
        GenericData.Record gr = (GenericData.Record) it.next();
        // Reaching this point with a full table means the file has more rows
        // than are being shown.
        if (numRows >= MAX_ROWS) {
          incompleteFileScan = true;
          break;
        }

        // OK, now the question is: which schema does the row observe?
        // Pick the schema with the most non-empty fields for this row;
        // ties go to the earliest schema in the list.
        int maxGood = 0;
        int bestIdx = -1;
        int i = 0;
        List<String> bestSchemaLabels = null;
        for (List<String> schemaLabels : schemaLabelLists) {
          int numGood = 0;
          for (String schemaHeader : schemaLabels) {
            Object result = SchemaUtils.getNestedValues(gr, schemaHeader);
            // A null lookup result is treated like an empty value.
            if (result != null && result.toString().length() > 0) {
              numGood++;
            }
          }
          if (numGood > maxGood) {
            maxGood = numGood;
            bestSchemaLabels = schemaLabels;
            bestIdx = i;
          }
          i++;
        }

        if (maxGood == 0) {
          // Some files, especially those recovered through automatic means, may have
          // lines that do not match any part of the schema; in that case, just skip
          // to the next line.
          continue;
        }

        List<DataField> tupleElts = new ArrayList<DataField>();
        for (String schemaHeader : bestSchemaLabels) {
          tupleElts.add(new DataField(schemaHeader, SchemaUtils.getNestedValues(gr, schemaHeader), path));
        }
        perSchemaTupleLists.get(bestIdx).add(tupleElts);

        // A change of schema starts a new table in data-order mode.
        if (bestIdx != lastBestIdx) {
          dataOrderTupleLists.add(new ArrayList<List<DataField>>());
        }
        dataOrderTupleLists.get(dataOrderTupleLists.size() - 1).add(tupleElts);
        schemaOrder.add(bestIdx);
        schemaFrequency.get(bestIdx).count += 1;
        lastBestIdx = bestIdx;
        numRows++;
      }

      //
      // Step 3. Build the hierarchical set of header rows for display.
      //
      // schemaLabelLists holds N lists of schema labels, one for each schema.
      // tupleLists holds N lists of tuples, one for each schema.
      // schemaOrder holds a list of M indexes, one for each tuple in the data
      // to be displayed.
      //

      //
      // Step 4. Build 3 different display modes.
      // There are 3 ways to view the data. All 3 get sent to the browser, and the user
      // can toggle among them.
      //
      // "RAW" mode is just the text of the data, as closely as we can formulate it
      //
      // "DATAORDER" mode means show the structured data in tables, ordered by the
      // tuples' appearance in the datafile.
      //
      // "SCHEMAORDER" mode means show the structured data in tables, ordered by
      // most-popular tables first.
      //

      // 4a. raw mode: a single one-column table with an empty header
      List<List<List<HeaderPair>>> rawOutputHeaderSets = new ArrayList<List<List<HeaderPair>>>();
      List<List<List<DataField>>> rawOutputTupleLists = new ArrayList<List<List<DataField>>>();

      List<List<HeaderPair>> headerSet = new ArrayList<List<HeaderPair>>();
      rawOutputHeaderSets.add(headerSet);
      List<HeaderPair> header = new ArrayList<HeaderPair>();
      header.add(new HeaderPair("", 1));
      headerSet.add(header);

      List<List<DataField>> singleTable = new ArrayList<List<DataField>>();
      rawOutputTupleLists.add(singleTable);
      for (List<List<DataField>> tupleList : dataOrderTupleLists) {
        for (List<DataField> tuple : tupleList) {
          List<DataField> singleTuple = new ArrayList<DataField>();
          StringBuilder sbuf = new StringBuilder();
          for (DataField df : tuple) {
            sbuf.append(df.getDataStr());
            sbuf.append(" ");
          }
          singleTuple.add(new DataField("", sbuf.toString().trim(), path));
          singleTable.add(singleTuple);
        }
      }

      // 4b. dataorder mode
      List<List<List<HeaderPair>>> dataOutputHeaderSets = new ArrayList<List<List<HeaderPair>>>();
      List<List<List<DataField>>> dataOutputTupleLists = dataOrderTupleLists;

      // Show data in order of how it appears in the file. One header set is
      // emitted per run of consecutive tuples that share a schema, mirroring
      // how dataOrderTupleLists was grouped above. The indexes are compared as
      // ints: comparing the boxed Integers with != tests object identity.
      for (int i = 1; i < schemaOrder.size(); i++) {
        int previousSchema = schemaOrder.get(i - 1).intValue();
        int currentSchema = schemaOrder.get(i).intValue();
        if (currentSchema != previousSchema) {
          createOutputHeaderSet(schemaLabelLists.get(previousSchema), dataOutputHeaderSets);
        }
      }
      if (schemaOrder.size() > 0) {
        int lastSchema = schemaOrder.get(schemaOrder.size() - 1).intValue();
        createOutputHeaderSet(schemaLabelLists.get(lastSchema), dataOutputHeaderSets);
      }

      // 4c. schemaorder mode
      List<List<List<HeaderPair>>> schemaOutputHeaderSets = new ArrayList<List<List<HeaderPair>>>();
      List<List<List<DataField>>> schemaOutputTupleLists = new ArrayList<List<List<DataField>>>();

      // Show data in order of schema popularity by descending frequency
      SchemaPair sortedByFreq[] = schemaFrequency.toArray(new SchemaPair[schemaFrequency.size()]);
      Arrays.sort(sortedByFreq, Collections.reverseOrder());

      // Iterate through, populate lists; schemas no displayed tuple matched are omitted
      for (int i = 0; i < sortedByFreq.length; i++) {
        if (sortedByFreq[i].count > 0) {
          createOutputHeaderSet(schemaLabelLists.get(sortedByFreq[i].schemaId), schemaOutputHeaderSets);
          schemaOutputTupleLists.add(perSchemaTupleLists.get(sortedByFreq[i].schemaId));
        }
      }

      //
      // Step 5. Add the info to the display.
      //
      final boolean hasCompletedFileScan = !incompleteFileScan;
      final int scannedRows = numRows;
      final long fsdSize = fsd.size;
      add(new WebMarkupContainer("completeScanMessage") {
        {
          setOutputMarkupPlaceholderTag(true);
          setVisibilityAllowed(hasCompletedFileScan);
          add(new Label("numberofcompletelines", "" + scannedRows));
        }
      });
      add(new WebMarkupContainer("incompleteScanMessage") {
        {
          setOutputMarkupPlaceholderTag(true);
          setVisibilityAllowed(!hasCompletedFileScan);
          add(new Label("numberofincompletelines", "" + scannedRows));
          add(new Label("numberOfTotalBytes", "" + fsdSize));
        }
      });

      List<DataTablePair> rawTablePairs = new ArrayList<DataTablePair>();
      for (int i = 0; i < rawOutputHeaderSets.size(); i++) {
        rawTablePairs.add(new DataTablePair(rawOutputHeaderSets.get(i), rawOutputTupleLists.get(i)));
      }
      renderToPage("rawtables", rawTablePairs, false);

      List<DataTablePair> dataTablePairs = new ArrayList<DataTablePair>();
      for (int i = 0; i < dataOutputHeaderSets.size(); i++) {
        dataTablePairs.add(new DataTablePair(dataOutputHeaderSets.get(i), dataOutputTupleLists.get(i)));
      }
      renderToPage("datatables", dataTablePairs, false);

      // Only the schema-ordered view offers query links, and only when querying is supported.
      List<DataTablePair> schemaTablePairs = new ArrayList<DataTablePair>();
      for (int i = 0; i < schemaOutputHeaderSets.size(); i++) {
        schemaTablePairs.add(new DataTablePair(schemaOutputHeaderSets.get(i), schemaOutputTupleLists.get(i)));
      }
      renderToPage("schematables", schemaTablePairs, querySupported);
    }
    setOutputMarkupPlaceholderTag(true);
    setVisibilityAllowed(false);
  }

  /**
   * Create a single schema-specific set of tuple information, which includes schema info.
   * Depending on how the user wants to view the page and the internal order of rows in
   * a table, a single file could have a large number of different ranges.
   *
   * Each label is split on '.'; the resulting parts are laid out bottom-up so
   * that the last part of every label lands in the bottom header row, and
   * shallower labels are padded with empty cells above. Adjacent cells with
   * the same text in a row are then merged into one cell with a wider colspan.
   *
   * @param schemaLabels     dotted, flattened field names of one schema
   * @param outputHeaderSets list the newly built header set is appended to
   */
  void createOutputHeaderSet(List<String> schemaLabels, List<List<List<HeaderPair>>> outputHeaderSets) {
    int maxDepth = 1;
    for (String s : schemaLabels) {
      int curDepth = s.split("\\.").length;
      maxDepth = Math.max(maxDepth, curDepth);
    }

    List<List<HeaderPair>> headerSet = new ArrayList<List<HeaderPair>>();
    for (int i = 0; i < maxDepth; i++) {
      headerSet.add(new ArrayList<HeaderPair>());
    }

    for (String s : schemaLabels) {
      String parts[] = s.split("\\.");
      for (int i = 0; i < parts.length; i++) {
        headerSet.get(maxDepth - i - 1).add(new HeaderPair(parts[parts.length - i - 1], 1));
      }
      for (int i = parts.length; i < maxDepth; i++) {
        headerSet.get(maxDepth - i - 1).add(new HeaderPair("", 1));
      }
    }

    List<List<HeaderPair>> newHeaderSet = new ArrayList<List<HeaderPair>>();
    for (List<HeaderPair> singleRow : headerSet) {
      List<HeaderPair> newHeaderRow = new ArrayList<HeaderPair>();
      // An empty label list yields an empty row; there is nothing to merge then.
      if (!singleRow.isEmpty()) {
        HeaderPair lastHp = singleRow.get(0);
        for (int i = 1; i < singleRow.size(); i++) {
          HeaderPair hp = singleRow.get(i);
          if (hp.getString().equals(lastHp.getString())) {
            lastHp.bumpCount();
          } else {
            newHeaderRow.add(lastHp);
            lastHp = hp;
          }
        }
        newHeaderRow.add(lastHp);
      }
      newHeaderSet.add(newHeaderRow);
    }

    List<HeaderPair> bottomLine = newHeaderSet.get(newHeaderSet.size() - 1);
    for (HeaderPair hp : bottomLine) {
      hp.setBottom(true);
    }
    outputHeaderSets.add(newHeaderSet);
  }

  /**
   * Decides whether the panel may be shown. It is hidden when no file is
   * bound or when the file summary cannot be read.
   */
  public void onConfigure() {
    // Wicket requires overrides of onConfigure() to call the super implementation.
    super.onConfigure();
    if (fid < 0) {
      setVisibilityAllowed(false);
    } else {
      FishEye fe = FishEye.getInstance();
      AccessController accessCtrl = fe.getAccessController();
      FSAnalyzer fsAnalyzer = fe.getAnalyzer();
      FileSummary fileSummary = new FileSummary(fsAnalyzer, fid);
      try {
        // NOTE(review): this requires fileSummary.isDir() to be true, which looks
        // inverted for a panel that shows file contents -- confirm whether
        // !fileSummary.isDir() was intended.
        setVisibilityAllowed(fe.hasFSAndCrawl()
            && accessCtrl.hasReadAccess(fileSummary)
            && fileSummary.isDir()
            && fileSummary.getDataDescriptor().getSchemaDescriptor().size() > 0);
      } catch (IOException iex) {
        // Treat an unreadable summary as "nothing to show".
        setVisibilityAllowed(false);
      }
    }
  }
}