サンプル Hive スクリプト

-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar 
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
-- Advanced Transformer is implemented as a UDF (User Defined function). Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION advanceTransform as 'com.pb.bdq.dnm.process.hive.advancetransformer.AdvanceTransformerUDF';

-- Set rule 
set rule='{"rules":[{"extractionType":"TableData", "source":"address", "nonExtractedData":"address_1", "extractedData":"address_2", "tokenizationCharacters":"", "tableName":"Street Suffix Abbreviations", "multipleTermLookup":false, "tokenize":true, "extract":"ExtractTerm", "includeTermWith":"ExtractedData", "wordsToExtract":2}]}';


-- Set Reference Directory. This must be a local path on cluster machines and must be present on each node of the cluster at the same path.
set refdir='/home/hadoop/reference/';

-- set header
set header ='AccountDescription,Address';

-- Execute Query on the desired table, to display the job output on console. This query returns a map of key value pairs containing output fields for each row.

SELECT bar.ret["AdvancedTransformTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["address_1"] 
FROM (
	SELECT advanceTransform(${hiveconf:rule}, ${hiveconf:refdir}, ${hiveconf:header}, accountdescription, address) 
	AS ret 
	FROM advxformX
	) bar;


-- Query to dump output data to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/AdvXformer/' row format delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE 
SELECT bar.ret["AdvancedTransformTermIdentified"],
	bar.ret["accountdescription"],
	bar.ret["address"],
	bar.ret["address_1"] 
FROM (
	SELECT advanceTransform(${hiveconf:rule}, ${hiveconf:refdir}, ${hiveconf:header}, accountdescription, address) 
	AS ret 
	FROM advxformX
	) bar;

--sample input data
+----------------------------------+---------------------+-----------------------+
| AdvancedTransformTermIdentified  | accountdescription  |      address          |
+----------------------------------+---------------------+-----------------------+
| Yes                              |      				 | 400 E M0 St Apt 1405  |
| Yes                              |                     | 190 E 72nd St         |
+----------------------------------+---------------------+-----------------------+

--sample output data
+----------------------------------+---------------------+-----------------------+--------------------+
| AdvancedTransformTermIdentified  | accountdescription  |      address          |     address_1      |
+----------------------------------+---------------------+-----------------------+--------------------+
| Yes                              |      				 | 400 E M0 St Apt 1405  | 400 E M0 Apt 1405  |
| Yes                              |                     | 190 E 72nd St         | 190 E 72nd         |
+----------------------------------+---------------------+-----------------------+--------------------+