Browse source code

AMBARI-22999 : Ambari Hive View 2.0 'Upload Table' does not support UTF8 files with BOM (nitirajrathore) (#510) (#527)

* AMBARI-22999 : Ambari Hive View 2.0 'Upload Table' does not support UTF8 files with BOM (nitirajrathore)

* AMBARI-22999 : Added relevant test cases (nitirajrathore)
nitirajrathore 7 years ago
parent
commit
f37a37b0f6

+ 22 - 3
contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/UploadService.java

@@ -18,6 +18,7 @@
 
 package org.apache.ambari.view.hive20.resources.uploads;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Optional;
 import com.sun.jersey.core.header.FormDataContentDisposition;
 import com.sun.jersey.multipart.FormDataParam;
@@ -46,6 +47,8 @@ import org.apache.ambari.view.hive20.resources.uploads.query.InsertFromQueryInpu
 import org.apache.ambari.view.hive20.utils.ServiceFormattedException;
 import org.apache.ambari.view.hive20.utils.SharedObjectsFactory;
 import org.apache.ambari.view.utils.ambari.AmbariApi;
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.input.BOMInputStream;
 import org.apache.commons.io.input.ReaderInputStream;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
@@ -195,6 +198,19 @@ public class UploadService extends BaseService {
     }
   }
 
+  private Reader getInputStreamReader(InputStream is) throws IOException {
+    BOMInputStream bomInputStream = new BOMInputStream(is,
+        ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
+        ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
+    );
+    if (bomInputStream.hasBOM()) {
+      String charSetName = bomInputStream.getBOMCharsetName();
+      return new InputStreamReader(bomInputStream, charSetName); // decode using the charset indicated by the BOM
+    } else {
+      return new InputStreamReader(bomInputStream); //return with default charset
+    }
+  }
+
   private CSVParams getCsvParams(String csvDelimiter, String csvQuote, String csvEscape) {
     char csvq =  CSVParams.DEFAULT_QUOTE_CHAR;
     char csvd =  CSVParams.DEFAULT_DELIMITER_CHAR;
@@ -455,7 +471,8 @@ public class UploadService extends BaseService {
     else return e.getMessage();
   }
 
-  private PreviewData generatePreview(Boolean isFirstRowHeader, String inputFileType, CSVParams csvParams, InputStream uploadedInputStream) throws Exception {
+  @VisibleForTesting
+  PreviewData generatePreview(Boolean isFirstRowHeader, String inputFileType, CSVParams csvParams, InputStream uploadedInputStream) throws Exception {
     ParseOptions parseOptions = new ParseOptions();
     parseOptions.setOption(ParseOptions.OPTIONS_FILE_TYPE, inputFileType);
     if (inputFileType.equals(ParseOptions.InputFileType.CSV.toString())){
@@ -473,7 +490,8 @@ public class UploadService extends BaseService {
 
     LOG.info("isFirstRowHeader : {}, inputFileType : {}", isFirstRowHeader, inputFileType);
 
-    DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions);
+    Reader reader = getInputStreamReader(uploadedInputStream);
+    DataParser dataParser = new DataParser(reader, parseOptions);
 
     return dataParser.parsePreview();
   }
@@ -519,7 +537,8 @@ public class UploadService extends BaseService {
       parseOptions.setOption(ParseOptions.OPTIONS_CSV_QUOTE, csvParams.getCsvQuote());
     }
 
-    DataParser dataParser = new DataParser(new InputStreamReader(uploadedInputStream), parseOptions);
+    Reader reader = getInputStreamReader(uploadedInputStream);
+    DataParser dataParser = new DataParser(reader, parseOptions);
 
     Reader csvReader = new TableDataReader(dataParser.iterator(), header, containsEndlines); // encode column values into HEX so that \n etc dont appear in the hive table data
     String path = uploadIntoTable(csvReader, databaseName, tableName);

+ 8 - 0
contrib/views/hive20/src/main/java/org/apache/ambari/view/hive20/resources/uploads/parsers/PreviewData.java

@@ -54,4 +54,12 @@ public class PreviewData {
   public void setPreviewRows(List<Row> previewRows) {
     this.previewRows = previewRows;
   }
+
+  @Override
+  public String toString() {
+    return "PreviewData{" +
+        "header=" + header +
+        ", previewRows=" + previewRows +
+        '}';
+  }
 }

+ 8 - 5
contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/internal/query/generators/InsertFromQueryGeneratorSpecTest.groovy

@@ -28,7 +28,7 @@ class InsertFromQueryGeneratorSpecTest extends Specification {
     setup:
     List<ColumnInfo> colInfos = Arrays.asList(new ColumnInfo("col1", "STRING"), new ColumnInfo("col2", "INT"), new ColumnInfo("col3", "VARCHAR", 255),
             new ColumnInfo("col4", "CHAR", 25))
-    InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", colInfos, false)
+    InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", Collections.emptyList(), colInfos, null, false)
     InsertFromQueryGenerator generator = new InsertFromQueryGenerator(insertFromQueryInput);
 
     when:
@@ -41,14 +41,16 @@ class InsertFromQueryGeneratorSpecTest extends Specification {
     String queryStr = query.get();
 
     then:
-    queryStr == "INSERT INTO TABLE `d2`.`t2` SELECT `col1`, `col2`, `col3`, `col4` FROM `d1.t1` ;"
+    queryStr == "set hive.exec.dynamic.partition.mode=nonstrict;\n" +
+            " FROM `d1`.`t1` tempTable INSERT INTO TABLE `d2`.`t2` SELECT tempTable.`col1`, tempTable.`col2`, tempTable.`col3`, tempTable.`col4`;"
   }
 
-  def "insert from with unhexing"() {
+  def "insert from with unhexing and partitioned columns"() {
     setup:
     List<ColumnInfo> colInfos = Arrays.asList(new ColumnInfo("col1", "STRING"), new ColumnInfo("col2", "INT"), new ColumnInfo("col3", "VARCHAR", 255),
             new ColumnInfo("col4", "CHAR", 25))
-    InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", colInfos, true)
+    List<ColumnInfo> partitionedCols = Arrays.asList(new ColumnInfo("col5", "STRING"), new ColumnInfo("col6", "INT"))
+    InsertFromQueryInput insertFromQueryInput = new InsertFromQueryInput("d1", "t1", "d2", "t2", partitionedCols, colInfos, null, true)
     InsertFromQueryGenerator generator = new InsertFromQueryGenerator(insertFromQueryInput);
 
     when:
@@ -61,6 +63,7 @@ class InsertFromQueryGeneratorSpecTest extends Specification {
     String queryStr = query.get();
 
     then:
-    queryStr == "INSERT INTO TABLE `d2`.`t2` SELECT UNHEX(`col1`), `col2`, UNHEX(`col3`), UNHEX(`col4`) FROM `d1.t1` ;"
+    queryStr ==  "set hive.exec.dynamic.partition.mode=nonstrict;\n" +
+            " FROM `d1`.`t1` tempTable INSERT INTO TABLE `d2`.`t2` PARTITION (`col5`,`col6` )  SELECT UNHEX(tempTable.`col1`), tempTable.`col2`, UNHEX(tempTable.`col3`), UNHEX(tempTable.`col4`), UNHEX(tempTable.`col5`), tempTable.`col6`;"
   }
 }

+ 60 - 0
contrib/views/hive20/src/test/java/org/apache/ambari/view/hive20/resources/uploads/UploadServiceTest.java

@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ambari.view.hive20.resources.uploads;
+
+import org.apache.ambari.view.hive20.client.Row;
+import org.apache.ambari.view.hive20.internal.dto.ColumnInfo;
+import org.apache.ambari.view.hive20.resources.uploads.parsers.PreviewData;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Arrays;
+
+public class UploadServiceTest {
+
+  @Test
+  public void generatePreviewWithBOM() throws Exception {
+    UploadService uploadService = new UploadService();
+    // convert String into InputStream
+    String str = "\ufeffCol1\tCol2\nA\tB\n";
+    InputStream inputStream = new ByteArrayInputStream(str.getBytes(java.nio.charset.StandardCharsets.UTF_8)); // explicit UTF-8 so \ufeff encodes as the EF BB BF BOM regardless of platform charset
+    PreviewData previewData = uploadService.generatePreview(true, "CSV", new CSVParams('\t', '\"', '\\'), inputStream);
+
+    Assert.assertEquals("Incorrect number of columns detected.", 2, previewData.getHeader().size() );
+    Assert.assertEquals("incorrect col objects.", Arrays.asList(new ColumnInfo("Col1", "CHAR", null, null, null),
+        new ColumnInfo("Col2", "CHAR", null, null, null)), previewData.getHeader());
+    Assert.assertEquals("incorrect row objects.", Arrays.asList(new Row(new Object[]{"A", "B"})), previewData.getPreviewRows());
+  }
+
+  @Test
+  public void generatePreviewWithoutBOM() throws Exception {
+    UploadService uploadService = new UploadService();
+    // convert String into InputStream
+    String str = "Col1\tCol2\nA\tB\n";
+    InputStream inputStream = new ByteArrayInputStream(str.getBytes());
+    PreviewData previewData = uploadService.generatePreview(true, "CSV", new CSVParams('\t', '\"', '\\'), inputStream);
+
+    Assert.assertEquals("Incorrect number of columns detected.", 2, previewData.getHeader().size() );
+    Assert.assertEquals("incorrect col objects.", Arrays.asList(new ColumnInfo("Col1", "CHAR", null, null, null),
+        new ColumnInfo("Col2", "CHAR", null, null, null)), previewData.getHeader());
+    Assert.assertEquals("incorrect row objects.", Arrays.asList(new Row(new Object[]{"A", "B"})), previewData.getPreviewRows());
+  }
+}