-- Register Data Normalization Modue [dnm] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
-- Table Lookup is implemented as a UDF (User Defined function). Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION tablelookup as 'com.pb.bdq.dnm.process.hive.tablelookup.TableLookUpUDF';
-- Set rule
set rule='{"rules":[{"action":"Standardize", "source":"CityCode", "tableName":"State Name Abbreviations", "lookupMultipleWordTerms":false, "lookupIndividualTermsWithinField":false, "destination":"CityCode"}]}';
-- Set Reference Directory. This must be a local path on cluster machines and must be present on each node of the cluster at the same path.
set refdir='/home/hadoop/reference';
-- set header
set header ='AccountDescription,Address,ApartmentNumber,CityCode';
-- Execute Query on the desired table, to display the job output on console. This query returns a map of key value pairs containing output fields for each row.
SELECT bar.ret["StandardizationTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["apartmentnumber"],
bar.ret["citycode"]
FROM (
SELECT tablelookup(${hiveconf:rule}, ${hiveconf:refdir}, ${hiveconf:header}, accountdescription, address, apartmentnumber, citycode)
AS ret
FROM citizen_data
) bar;
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/TableLookup/' row format delimited FIELDS TERMINATED BY ',' lines terminated by '\n' STORED AS TEXTFILE
SELECT bar.ret["StandardizationTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["apartmentnumber"],
bar.ret["citycode"]
FROM (
SELECT tablelookup(${hiveconf:rule}, ${hiveconf:refdir}, ${hiveconf:header}, accountdescription, address, apartmentnumber, citycode)
AS ret
FROM citizen_data
) bar;
--Sample input data
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| citizen_data.accountdescription | citizen_data.address | citizen_data.apartmentnumber | citizen_data.citycode |
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--| | 400 E M0 St Apt 1405 | | NY |
--| | 190 E 72nd St | | NY |
--| | 1381 3rd Ave Apt 4 | 4 | TTYYY |
--+----------------------------------+-----------------------+-------------------------------+------------------------+
--sample output data
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
--|StandardizationTermIdentified | accountdescription | address | apartmentnumber | citycode|
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+
--| yes | | 400 E M0 St Apt 1405 | | NEW YORK |
--| yes | | 190 E 72nd St | | NEW YORK |
--| yes | | 1381 3rd Ave Apt 4 | 4 | NEW YORK |
--+------------------------------+--------------------------+-----------------------+-----------------------+----------+