-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';
-- This rowid is needed by Transactional Match to maintain the order of rows while creating groups. This is a UDF (User Defined Function) and associates an incremental unique integer number to each row of the data.
CREATE TEMPORARY FUNCTION transactionalMatch as 'com.pb.bdq.amm.process.hive.transactional.TransactionalMatchUDAF';
-- Transactional Match is implemented as a UDAF (User Defined Aggregation function). It processes one group of rows at a time and generates the result for that group of rows.
-- Disable map side aggregation
set hive.map.aggr = false;
-- Set the rule using configuration property 'pb.bdq.match.rule'
set pb.bdq.match.rule={"type":"Parent", "children":[{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0, "algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0, "algorithms":[{"name":"KeyboardDistance","weight":0,"options":null},
{"name":"Metaphone3","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"lastname", "candidateField":null}], "matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks", "name":"NameData", "matchWhenNotTrue":false, "threshold":100, "weight":0};
-- Set header(along with id field alias used in query) using configuration property 'pb.bdq.match.header'
set pb.bdq.match.header=name,firstname,lastname,matchkey,middlename,recordid,id;
-- Set sort field name to the alias used in the query, using the configuration property 'pb.bdq.match.sort.field'
set pb.bdq.match.sort.field=id;
-- Set sort collection number option for unique records using configuration property 'pb.bdq.match.unique.candidate.return'. The default value is false.
set pb.bdq.match.unique.candidate.return=true;
-- Execute Query on the desired table. The query uses a UDF rowid, which must be present in the query to maintain the ordering of the data while reading.
-- Transactional Match returns a list of map containing <key=value> pairs. Each map in the list corresponds to a row in the group. The below query explodes that list of map and fetches fields from map by keys.
SELECT tmp2.record["MatchRecordType"],
tmp2.record["MatchScore"],
tmp2.record["HasDuplicate"],
tmp2.record["name"],
tmp2.record["firstname"],
tmp2.record["lastname"],
tmp2.record["matchkey"],
tmp2.record["middlename"],
tmp2.record["recordid"]
FROM (
SELECT transactionalMatch(innerRowID.name, innerRowID.firstname, innerRowID.lastname, innerRowID.matchkey, innerRowID.middlename, innerRowID.recordid, innerRowID.id
) AS matchgroup
FROM (
SELECT name, firstname, lastname, matchkey, middlename, recordid, rowid(name, firstname, lastname, matchkey, middlename, recordid) AS id FROM customer_data
) innerRowID
GROUP BY matchkey
) As innerResult
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;
-- Query to dump output to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/transmatch/' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT tmp2.record["MatchRecordType"],
tmp2.record["MatchScore"],
tmp2.record["HasDuplicate"],
tmp2.record["name"],
tmp2.record["firstname"],
tmp2.record["lastname"],
tmp2.record["matchkey"],
tmp2.record["middlename"],
tmp2.record["recordid"]
FROM (
SELECT transactionalMatch(innerRowID.name,
innerRowID.firstname,
innerRowID.lastname,
innerRowID.matchkey,
innerRowID.middlename,
innerRowID.recordid,
innerRowID.id) as matchgroup
FROM (
SELECT name, firstname, lastname, matchkey, middlename, recordid, rowid(name, firstname, lastname, matchkey, middlename, recordid) AS id
FROM customer_data
) innerRowID
GROUP BY matchkey ) As innerResult
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;
--sample input data
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| name | firstname | lastname | matchkey | middlename | recordid |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| ZORINA ABDOOL | ZORINA | ABDOOL | Z | | 12 |
--| ZULFIQAR ALI | ZULFIQAR | ALI | Z | | 116 |
--| ZACHARY BENNETT | ZACHARY | BENNETT | Z | | 515 |
--| ZOHAR BUERGER | ZOHAR | BUERGER | Z | | 889 |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--sample output data
--+----------------+---------------------+----------+------------+-----------+-----------------+------------+--------------+
--|name |firstname | lastname | matchkey | middlename | recordid | MatchRecordType | MatchScore | HasDuplicate |
--+----------------+----------+----------+----------+------------+-----------+-----------------+------------+--------------+
--|ZORINA ABDOOL |ZORINA | ABDOOL | Z | | 12 | S | 0 | Y |
--|ZULFIQAR ALI |ZULFIQAR | ALI | Z | | 116 | D | 90 | D |
--|ZACHARY BENNETT|ZACHARY | BENNETT | Z | | 515 | D | 91 | D |
--|ZOHAR BUERGER |ZOHAR | BUERGER | Z | | 889 | D | 91 | D |
--+----------------+----------+---------------------+------------------------+-----------------+------------+--------------+