Reference data placed on local node
-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Register the grammar for open parser
ADD FILE /home/hadoop/testing/openparser/hive/testdata/Open_parser_grammer_tc1.txt;
-- Provide alias to UDF class (optional). String in quotes
represent class names needed for this job to run.
-- Open Parser is implemented as a UDF(User Defined function).
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION openparser as 'com.pb.bdq.dnm.process.hive.openparser.OpenParserUDF';
set hive.fetch.task.conversion=minimal;
set hivevar:header='InputKeyValue,addressline';
-- Set Reference Directory. This must be a local path on
cluster machines and must be present on each node of the cluster at the same path.
set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"LocaltoDataNodes","dataDir":"/home/data/referenceData"}';
-- Set rule with grammar file name
set hivevar:rule='{"grammar":null,"domain":null,"defaultCulture":null,
"culture":null,"returnMultipleParsedRecords":false,"debug":false,
"grammerFilePath":"Open_parser_grammer_tc1.txt"}';
--Execute Query on desired table to sump the output to local dir.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/testing/openparser/hive/testdata/output'
row format delimited FIELDS TERMINATED BY '|'
lines terminated by '\n'
STORED AS TEXTFILE
SELECT coalesce(tmp2.record ["InputKeyValue"], '')
,coalesce(tmp2.record ["addressline"], '')
,coalesce(tmp2.record ["ApartmentNumber"], '')
,coalesce(tmp2.record ["HouseNumber"], '')
,coalesce(tmp2.record ["IsParsed"], '')
,coalesce(tmp2.record ["LeadingDirectional"], '')
,coalesce(tmp2.record ["ParserScore"], '')
,coalesce(tmp2.record ["POBox"], '')
,coalesce(tmp2.record ["PrivateMailbox"], '')
,coalesce(tmp2.record ["RRHC"], '')
,coalesce(tmp2.record ["RuleID"], '')
,coalesce(tmp2.record ["StreetName"], '')
,coalesce(tmp2.record ["StreetSuffix"], '')
,coalesce(tmp2.record ["TrailingDirectional"], '')
FROM (
SELECT openparser(${hivevar:rule},${hivevar:refereceDataDetails},
${hivevar:header}, InputKeyValue, addressline) AS mygp
FROM openparserinputtable1
) AS addressgroup LATERAL VIEW explode(addressgroup.mygp) tmp2 AS record;
Reference data placed on HDFS and downloaded on local nodes for jobs
-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Register the grammar for open parser
ADD FILE /home/hadoop/testing/openparser/hive/testdata/Open_parser_grammer_tc1.txt;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
-- Open Parser is implemented as a UDF(User Defined function).
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION openparser as 'com.pb.bdq.dnm.process.hive.openparser.OpenParserUDF';
set hive.fetch.task.conversion=minimal;
set hivevar:header='InputKeyValue,addressline';
-- Set reference data details for Download manager, paas
dataDir where data resides in HDFS and localFS path to download the data and dataDownloader as HDFS
set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"HDFS","dataDir":"/home/data/dm/referenceData","dataDownloader":
{"dataDownloader":"HDFS","localFSRepository":"/local/download"}}';
set hive.fetch.task.conversion=none;
-- Set rule with grammar file name
set hivevar:rule='{"grammar":null,"domain":null,"defaultCulture":null,
"culture":null,"returnMultipleParsedRecords":false,"debug":false,
"grammerFilePath":"Open_parser_grammer_tc1.txt"}';
--Execute Query on desired table to sump the output to local dir.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/testing/openparser/hive/testdata/output'
row format delimited FIELDS TERMINATED BY '|'
lines terminated by '\n'
STORED AS TEXTFILE
SELECT coalesce(tmp2.record ["InputKeyValue"], '')
,coalesce(tmp2.record ["addressline"], '')
,coalesce(tmp2.record ["ApartmentNumber"], '')
,coalesce(tmp2.record ["HouseNumber"], '')
,coalesce(tmp2.record ["IsParsed"], '')
,coalesce(tmp2.record ["LeadingDirectional"], '')
,coalesce(tmp2.record ["ParserScore"], '')
,coalesce(tmp2.record ["POBox"], '')
,coalesce(tmp2.record ["PrivateMailbox"], '')
,coalesce(tmp2.record ["RRHC"], '')
,coalesce(tmp2.record ["RuleID"], '')
,coalesce(tmp2.record ["StreetName"], '')
,coalesce(tmp2.record ["StreetSuffix"], '')
,coalesce(tmp2.record ["TrailingDirectional"], '')
FROM (
SELECT openparser(${hivevar:rule},${hivevar:refereceDataDetails},
${hivevar:header}, InputKeyValue, addressline) AS mygp
FROM openparserinputtable1
) AS addressgroup LATERAL VIEW explode(addressgroup.mygp) tmp2 AS record;
Reference data placed on HDFS and downloaded to working directory for jobs
-- Register Data Normalisation Module [DNM] BDQ Hive UDF Jar
ADD JAR <Directory path>/dnm.hive.${project.version}.jar;
-- Register the grammar for open parser
ADD FILE /home/hadoop/testing/openparser/hive/testdata/Open_parser_grammer_tc1.txt;
-- HDFS Reference data- via Distributed Cache DC- unarchive
ADD FILES hdfs://<HOST>:<PORT>/home/hduser/referenceData/;
--HDFS reference data- via Distributed cache -archive form of reference data
--ADD ARCHIVE hdfs://<HOST>:<PORT>/home/hduser/referenceData.zip;
-- Provide alias to UDF class (optional).
String in quotes represent class names needed for this job to run.
-- Open Parser is implemented as a UDF(User Defined function).
Hence it processes one row at a time and generates a map of key value pairs for each row.
CREATE TEMPORARY FUNCTION openparser as
'com.pb.bdq.dnm.process.hive.openparser.OpenParserUDF';
set hive.fetch.task.conversion=minimal;
set hivevar:header='InputKeyValue,addressline';
-- Set Reference Data details .
--reference data details, can be added in zip or
unarchive form, dataDir symbolises reference data
set hivevar:refereceDataDetails='{"referenceDataPathLocation":"HDFS",
"dataDir":"./referenceData","dataDownloader":{"dataDownloader":"DC"}}';
-- below format for archive form
--set hivevar:refereceDataDetails='{"referenceDataPathLocation":
"HDFS","dataDir":"./referenceData.zip","dataDownloader":{"dataDownloader":"DC"}}';
-- Set rule with grammar file name
set hivevar:rule='{"grammar":null,"domain":null,"defaultCulture":null,
"culture":null,"returnMultipleParsedRecords":false,"debug":
false,"grammerFilePath":"Open_parser_grammer_tc1.txt"}';
--Execute Query on desired table to sump the output to local dir.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/testing/openparser/hive/testdata/output'
row format delimited FIELDS TERMINATED BY '|'
lines terminated by '\n'
STORED AS TEXTFILE
SELECT coalesce(tmp2.record ["InputKeyValue"], '')
,coalesce(tmp2.record ["addressline"], '')
,coalesce(tmp2.record ["ApartmentNumber"], '')
,coalesce(tmp2.record ["HouseNumber"], '')
,coalesce(tmp2.record ["IsParsed"], '')
,coalesce(tmp2.record ["LeadingDirectional"], '')
,coalesce(tmp2.record ["ParserScore"], '')
,coalesce(tmp2.record ["POBox"], '')
,coalesce(tmp2.record ["PrivateMailbox"], '')
,coalesce(tmp2.record ["RRHC"], '')
,coalesce(tmp2.record ["RuleID"], '')
,coalesce(tmp2.record ["StreetName"], '')
,coalesce(tmp2.record ["StreetSuffix"], '')
,coalesce(tmp2.record ["TrailingDirectional"], '')
FROM (
SELECT openparser(${hivevar:rule},${hivevar:refereceDataDetails},
${hivevar:header}, InputKeyValue, addressline) AS mygp
FROM openparserinputtable1
) AS addressgroup LATERAL VIEW explode(addressgroup.mygp) tmp2 AS record;