サンプル Hive スクリプト

-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION matchkeygenerator as 'com.pb.bdq.amm.process.hive.matchkeygenerator.MatchKeyGeneratorUDF';

-- Match Key Generator is implemented as a UDF (User Defined function). It processes one row at a time and generates a map of match keys for each row.

-- Set rule and header 
set rule='{"matchKeys":[{"expressMatchKey":false, "matchKeyField":"MatchKey1",
"rules":[{"algorithm":"Soundex"," field":"businessname", "startPosition":1, "length":0,"active":true, "sortInput":null, "removeNoiseCharacters":false}]}, 
{"expressMatchKey":false, "matchKeyField":"MatchKey2", "rules":[{"algorithm":"Koeln", "field":"businessname", "startPosition":1, "length":0, "active":true, "sortInput":null, "removeNoiseCharacters":false}]}]}';

set header='businessname,recordid';

-- Execute query on the desired table to display the job output on console. This query returns a map of key value for each row containing matchkeys as per rule passed.
SELECT businessname, recordid, bar.ret["MatchKey1"] AS MatchKey1, bar.ret["MatchKey2"] AS MatchKey2 FROM (
SELECT *, matchkeygenerator (${hiveconf:rule}, ${hiveconf:header}, businessname, recordid) AS ret FROM cust ) bar;


-- Query to dump output to a directory in file system
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/MatchKey/' row format delimited FIELDS TERMINATED BY ',' MAP FIELDS TERMINATED BY ':' COLLECTION ITEMS TERMINATED BY '|' LINES TERMINATED BY '\n' STORED AS TEXTFILE 
SELECT businessname, recordid, bar.ret["MatchKey1"] AS MatchKey1, bar.ret["MatchKey2"] AS MatchKey2 FROM (
SELECT *, matchkeygenerator (${hiveconf:rule}, ${hiveconf:header}, businessname, recordid) AS ret FROM cust ) bar;

--Sample data in input table customer
--+------------------------------------------+----------------+--+
--|            cust.businessname             | cust.recordid  |
--+------------------------------------------+----------------+--+
--| Internal Revenue Service                 | 0              |
--| Juan F Vera-Monroig                      | 1              |
--| Leonardo Pagan-Reyes                     | 2              |
--| Academia San Joaquin Colegios/Academias  | 3              |
--| Nereida Portalatin-Padua                 | 4              |
--+------------------------------------------+----------------+--+


--Sample output for input query
+------------------------------------------+-----------+------------+-------------------+--+
|               businessname               | recordid  | matchkey1  |     matchkey2     |
+------------------------------------------+-----------+------------+-------------------+--+
| Internal Revenue Service                 | 0         | I536       | 0627657368738     |
| Juan F Vera-Monroig                      | 1         | J511       | 063376674         |
| Leonardo Pagan-Reyes                     | 2         | L563       | 567214678         |
| Academia San Joaquin Colegios/Academias  | 3         | A235       | 0426864645484268  |
| Nereida Portalatin-Padua                 | 4         | N631       | 67217252612       |
+------------------------------------------+-----------+------------+-------------------+--+