-- Register the Universal Name Module [UNM] BDQ Hive UDF jar with the current Hive session.
-- Replace <Directory path> with the directory that holds the jar.
-- NOTE(review): ${project.version} looks like a version placeholder that has to resolve to the real jar file name - confirm how it is filled in.
ADD JAR <Directory path>/unm.hive.${project.version}.jar;
-- Expose the Open Name Parser UDF under the session-scoped alias opennameparser (the alias is optional).
-- The quoted string is the fully qualified name of the UDF class that this job needs in order to run.
-- Because the parser is a UDF (User Defined Function) it handles one row at a time and produces a map of key value pairs per row.
CREATE TEMPORARY FUNCTION opennameparser
    AS 'com.pb.bdq.unm.process.hive.opennameparser.OpenNameParserUDF';
-- Parser rule, referenced by the queries as ${hiveconf:rule} and passed to the UDF as its first argument.
-- The value is a JSON document. Keep it on one line and keep the enclosing single quotes, because the queries
-- reference the variable without quoting it and rely on the stored text being a complete string literal.
-- NOTE(review): every parse* option is false in this sample rule - confirm the options you actually need before running.
set rule='{"name":"name", "culture":"", "splitConjoinedNames":false, "shortcutThreshold":0, "parseNaturalOrderPersonalNames":false, "naturalOrderPersonalNamesPriority":1, "parseReverseOrderPersonalNames":false, "reverseOrderPersonalNamesPriority":2, "parseConjoinedNames":false, "naturalOrderConjoinedPersonalNamesPriority":3, "reverseOrderConjoinedPersonalNamesPriority":4, "parseBusinessNames":false, "businessNamesPriority":5}';
-- Reference data directory, referenced as ${hiveconf:refdir} and passed as the second UDF argument.
-- This must be a local path on the cluster machines and must be present at the same path on each node of the cluster.
set refdir='/home/hadoop/reference/';
-- Header, referenced as ${hiveconf:header} and passed as the third UDF argument.
-- NOTE(review): presumably a comma separated list naming the input columns that follow it in the UDF call, in the same order - confirm against the UDF documentation.
set header='inputrecordid,Name,nametype';
-- Run the parser over every row of the nameparser table and show the job output on the console.
-- The inner query calls the UDF once per row with the rule, reference directory and header variables
-- followed by the three input columns. The outer query picks the Name, NameScore and CultureCode
-- output fields from each exploded element.
-- NOTE(review): explode is given a single alias, which implies the UDF result is an array whose
-- elements are keyed by output field name - confirm against the UDF documentation.
SELECT
    parsed.fields["Name"],
    parsed.fields["NameScore"],
    parsed.fields["CultureCode"]
FROM (
    SELECT
        opennameparser(
            ${hiveconf:rule},
            ${hiveconf:refdir},
            ${hiveconf:header},
            inputrecordid,
            name,
            nametype
        ) AS parser_output
    FROM nameparser
) AS raw_rows
LATERAL VIEW explode(parser_output) parsed AS fields;
-- Dump the parser output to a local directory instead of the console.
-- Rows are written as text, one record per line, with a comma between the Name, NameScore and
-- CultureCode fields. OVERWRITE replaces whatever the target directory already contains.
-- NOTE(review): a parsed value that itself contains a comma would be ambiguous in this output - confirm the delimiter suits your data.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/opennameparser/'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
SELECT
    parsed.fields["Name"],
    parsed.fields["NameScore"],
    parsed.fields["CultureCode"]
FROM (
    SELECT
        opennameparser(
            ${hiveconf:rule},
            ${hiveconf:refdir},
            ${hiveconf:header},
            inputrecordid,
            name,
            nametype
        ) AS parser_output
    FROM nameparser
) AS raw_rows
LATERAL VIEW explode(parser_output) parsed AS fields;
-- Sample input data
-- +----------------------------------+--------------------------+-----------------------+
-- | inputrecordid | name | nametype |
-- +----------------------------------+--------------------------+-----------------------+
-- | 1 |JOHN VAN DER LINDEN-JONES | Simple Name |
-- | 2 |RYAN JOHN SMITH | Simple Name |
-- +----------------------------------+--------------------------+-----------------------+
-- Sample output data
-- +----------------------------+--------------+---------------------+
-- | Name | NameScore | CultureCode |
-- +----------------------------+--------------+---------------------+
-- | JOHN VAN DER LINDEN-JONES | 75 | True |
-- | RYAN JOHN SMITH | 100 | True |
-- +----------------------------+--------------+---------------------+