Reference data placed on local node
-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
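-- ADD JAR places the UDF jar on the session classpath and makes it available to
-- the cluster at query execution time. Replace <Directory path> and
-- ${project.version} with the actual location and version of the installed jar.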
-- Provide an alias for the UDF class (optional). The string in quotes is the
-- class name needed for this job to run.
-- Advanced Transformer is implemented as a UDF (User Defined Function), so it
-- processes one row at a time and generates a map of key-value pairs for each row.
CREATE TEMPORARY FUNCTION advanceTransform
as 'com.pb.bdq.dnm.process.hive.advancedtransformer.AdvancedTransformerUDF';
-- Set rule
set hivevar:rule='{"rules":[{"extractionType":"TableData",
"source":"address", "nonExtractedData":"address_1", "extractedData":"address_2",
"tokenizationCharacters":"", "tableName":"Street Suffix Abbreviations", "multipleTermLookup":false,
"tokenize":true, "extract":"ExtractTerm", "includeTermWith":"ExtractedData", "wordsToExtract":2}]}';
-- Set the reference data directory. This must be a local path on the cluster
-- machines and must be present at the same path on every node of the cluster.
set hivevar:referenceDataDetails='{"referenceDataPathLocation":"LocaltoDataNodes",
"dataDir":"/home/data/referenceData"}';
-- set header
set hivevar:header ='AccountDescription,Address';
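-- The header fields pair up positionally with the trailing UDF arguments, so
-- AccountDescription describes the accountdescription column and Address the
-- address column passed to advanceTransform below.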
set hive.fetch.task.conversion=none;
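-- Disabling fetch-task conversion forces the query to run as a full cluster
-- job instead of a client-side fetch; presumably this is why it is set here,
-- since the UDF's reference data is available only on the cluster nodes.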
-- Execute the query on the desired table to display the job output on the
-- console. The query returns a map of key-value pairs containing the output
-- fields for each row.
SELECT bar.ret["AdvancedTransformTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["address_1"]
FROM (
SELECT advanceTransform(${hivevar:rule}, ${hivevar:referenceDataDetails},
${hivevar:header}, accountdescription, address)
AS ret
FROM advxformX
) bar;
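-- The UDF returns a map, so any output field can be addressed by key. For
-- example, a hypothetical follow-up query (not part of the original job) that
-- keeps only the rows where a term was actually extracted:
SELECT bar.ret["address"], bar.ret["address_1"]
FROM (
SELECT advanceTransform(${hivevar:rule}, ${hivevar:referenceDataDetails},
${hivevar:header}, accountdescription, address)
AS ret
FROM advxformX
) bar
WHERE bar.ret["AdvancedTransformTermIdentified"] = 'Yes';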
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/AdvXformer/' ROW FORMAT
DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE
SELECT bar.ret["AdvancedTransformTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["address_1"]
FROM (
SELECT advanceTransform(${hivevar:rule}, ${hivevar:referenceDataDetails},
${hivevar:header}, accountdescription, address)
AS ret
FROM advxformX
) bar;
--sample input data
+---------------------+-----------------------+
| accountdescription  | address               |
+---------------------+-----------------------+
|                     | 400 E M0 St Apt 1405  |
|                     | 190 E 72nd St         |
+---------------------+-----------------------+
--sample output data
+----------------------------------+---------------------+-----------------------+--------------------+
| AdvancedTransformTermIdentified | accountdescription | address | address_1 |
+----------------------------------+---------------------+-----------------------+--------------------+
| Yes | | 400 E M0 St Apt 1405 | 400 E M0 Apt 1405 |
| Yes | | 190 E 72nd St | 190 E 72nd |
+----------------------------------+---------------------+-----------------------+--------------------+
Reference data placed on HDFS and downloaded on local nodes for jobs
-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Provide an alias for the UDF class (optional). The string in quotes is the
-- class name needed for this job to run.
-- Advanced Transformer is implemented as a UDF (User Defined Function), so it
-- processes one row at a time and generates a map of key-value pairs for each row.
CREATE TEMPORARY FUNCTION advanceTransform as
'com.pb.bdq.dnm.process.hive.advancedtransformer.AdvancedTransformerUDF';
-- Set rule
set hivevar:rule='{"rules":[{"extractionType":"TableData", "source":"address",
"nonExtractedData":"address_1", "extractedData":"address_2", "tokenizationCharacters":"",
"tableName":"Street Suffix Abbreviations", "multipleTermLookup":false, "tokenize":true,
"extract":"ExtractTerm", "includeTermWith":"ExtractedData", "wordsToExtract":2}]}';
-- Set the reference data details for the download manager: pass dataDir as the
-- HDFS path where the data resides, localFSRepository as the local path to
-- download the data to, and dataDownloader as HDFS.
set hivevar:referenceDataDetails='{"referenceDataPathLocation":"HDFS",
"dataDir":"/home/data/dm/referenceData",
"dataDownloader":{"dataDownloader":"HDFS","localFSRepository":"/local/download"}}';
-- set header
set hivevar:header ='AccountDescription,Address';
set hive.fetch.task.conversion=none;
-- Execute the query on the desired table to display the job output on the
-- console. The query returns a map of key-value pairs containing the output
-- fields for each row.
SELECT bar.ret["AdvancedTransformTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["address_1"]
FROM (
SELECT advanceTransform(${hivevar:rule}, ${hivevar:referenceDataDetails},
${hivevar:header}, accountdescription, address)
AS ret
FROM advxformX
) bar;
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/AdvXformer/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE
SELECT bar.ret["AdvancedTransformTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["address_1"]
FROM (
SELECT advanceTransform(${hivevar:rule}, ${hivevar:referenceDataDetails},
${hivevar:header}, accountdescription, address)
AS ret
FROM advxformX
) bar;
--sample input data
+---------------------+-----------------------+
| accountdescription  | address               |
+---------------------+-----------------------+
|                     | 400 E M0 St Apt 1405  |
|                     | 190 E 72nd St         |
+---------------------+-----------------------+
--sample output data
+----------------------------------+---------------------+-----------------------+--------------------+
| AdvancedTransformTermIdentified | accountdescription | address | address_1 |
+----------------------------------+---------------------+-----------------------+--------------------+
| Yes | | 400 E M0 St Apt 1405 | 400 E M0 Apt 1405 |
| Yes | | 190 E 72nd St | 190 E 72nd |
+----------------------------------+---------------------+-----------------------+--------------------+
Reference data placed on HDFS and downloaded to working directory for jobs
-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Provide an alias for the UDF class (optional). The string in quotes is the
-- class name needed for this job to run.
-- Advanced Transformer is implemented as a UDF (User Defined Function), so it
-- processes one row at a time and generates a map of key-value pairs for each row.
CREATE TEMPORARY FUNCTION advanceTransform as
'com.pb.bdq.dnm.process.hive.advancedtransformer.AdvancedTransformerUDF';
-- HDFS reference data via Distributed Cache (DC), unarchived form
ADD FILES hdfs://<HOST>:<PORT>/home/hduser/referenceData/;
-- HDFS reference data via Distributed Cache (DC), archived form of reference data
--ADD ARCHIVE hdfs://<HOST>:<PORT>/home/hduser/referenceData.zip;
-- Set rule
set hivevar:rule='{"rules":[{"extractionType":"TableData",
"source":"address", "nonExtractedData":"address_1", "extractedData":"address_2",
"tokenizationCharacters":"", "tableName":"Street Suffix Abbreviations", "multipleTermLookup":false,
"tokenize":true, "extract":"ExtractTerm", "includeTermWith":"ExtractedData", "wordsToExtract":2}]}';
-- Set the reference data details. The reference data can be supplied in zipped
-- or unarchived form; dataDir is the path of the reference data in the job's
-- working directory.
set hivevar:referenceDataDetails='{"referenceDataPathLocation":"HDFS",
"dataDir":"./referenceData","dataDownloader":{"dataDownloader":"DC"}}';
-- Use the format below for the archive form:
--set hivevar:referenceDataDetails='{"referenceDataPathLocation":"HDFS",
--"dataDir":"./referenceData.zip","dataDownloader":{"dataDownloader":"DC"}}';
-- set header
set hivevar:header ='AccountDescription,Address';
set hive.fetch.task.conversion=none;
-- Execute the query on the desired table to display the job output on the
-- console. The query returns a map of key-value pairs containing the output
-- fields for each row.
SELECT bar.ret["AdvancedTransformTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["address_1"]
FROM (
SELECT advanceTransform(${hivevar:rule}, ${hivevar:referenceDataDetails},
${hivevar:header}, accountdescription, address)
AS ret
FROM advxformX
) bar;
-- Query to dump output data to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/AdvXformer/'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE
SELECT bar.ret["AdvancedTransformTermIdentified"],
bar.ret["accountdescription"],
bar.ret["address"],
bar.ret["address_1"]
FROM (
SELECT advanceTransform(${hivevar:rule}, ${hivevar:referenceDataDetails},
${hivevar:header}, accountdescription, address)
AS ret
FROM advxformX
) bar;
--sample input data
+---------------------+-----------------------+
| accountdescription  | address               |
+---------------------+-----------------------+
|                     | 400 E M0 St Apt 1405  |
|                     | 190 E 72nd St         |
+---------------------+-----------------------+
--sample output data
+----------------------------------+---------------------+-----------------------+--------------------+
| AdvancedTransformTermIdentified | accountdescription | address | address_1 |
+----------------------------------+---------------------+-----------------------+--------------------+
| Yes | | 400 E M0 St Apt 1405 | 400 E M0 Apt 1405 |
| Yes | | 190 E 72nd St | 190 E 72nd |
+----------------------------------+---------------------+-----------------------+--------------------+