-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;
-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';
-- This rowid is needed by Intraflow Match to maintain the order of rows while creating groups. This is a UDF (User Defined Function) and associates an incremental unique integer number to each row of the data.
CREATE TEMPORARY FUNCTION intraMatch as 'com.pb.bdq.amm.process.hive.intraflow.IntraMatchUDAF';
-- Intra Flow is implemented as a UDAF (User Defined Aggregation function). It processes one group of rows at a time and generates the result for that group of rows
-- Disable map side aggregation
set hive.map.aggr = false;
-- Set the rule using configuration property 'pb.bdq.match.rule'
set pb.bdq.match.rule={"type":"Parent",
"children":[{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0,
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0, "algorithms":[{"name":"KeyboardDistance", "weight":0, "options":null},
{"name":"Metaphone3", "weight":0, "options":null}], "scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"lastname", "candidateField":null}],
"matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks", "name":"NameData", "matchWhenNotTrue":false, "threshold":100,"weight":0};
-- Set header(along with id field alias used in query) using configuration property 'pb.bdq.match.header'
set pb.bdq.match.header=firstname,lastname,matchkey,middlename,id;
-- Set the express match column (optional)
set pb.bdq.match.express.column=matchkey;
-- Set sort field name to the alias used in the query, using the configuration property 'pb.bdq.match.sort.field'
set pb.bdq.match.sort.field=id;
-- Set sort collection number option for unique records using configuration property 'pb.bdq.match.unique.collectnumber.zero'
set pb.bdq.match.unique.collectnumber.zero=false;
-- Execute Query on the desired table. The query uses a UDF rowid, which must be present in the query to maintain the ordering of the data while reading.
-- Intra Match returns a list of map containing <key=value> pairs. Each map in the list corresponds to a row in the group. The below query explodes that list of map and fetches fields from map by keys.
SELECT innerresult.record["MatchRecordType"],
innerresult.record["MatchScore"],
innerresult.record["CollectionNumber"],
innerresult.record["ExpressMatched"],
innerresult.record["firstname"],
innerresult.record["lastname"],
innerresult.record["matchkey"],
innerresult.record["middlename"]
FROM (
SELECT intraMatch(
innerRowID.firstname,
innerRowID.lastname,
innerRowID.matchkey,
innerRowID.middlename,
innerRowID.id
) AS matchgroup
FROM (
SELECT firstname, lastname, matchkey, middlename, rowid(*)
AS id
FROM customer_data
) innerRowID
GROUP BY matchkey
) AS innerResult
LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;
-- Query to dump output to a file
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/IntraFlow/' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT innerresult.record["MatchRecordType"],
innerresult.record["MatchScore"],
innerresult.record["CollectionNumber"],
innerresult.record["ExpressMatched"],
innerresult.record["firstname"],
innerresult.record["lastname"],
innerresult.record["matchkey"],
innerresult.record["middlename"]
FROM (
SELECT intraMatch(innerRowID.firstname,
innerRowID.lastname,
innerRowID.matchkey,
innerRowID.middlename,
innerRowID.id
) AS matchgroup
FROM (
SELECT firstname, lastname, matchkey, middlename, rowid(*)
AS id
FROM customer_data
) innerRowID
GROUP BY matchkey
) AS innerResult
LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;
--sample input data
--+-------------+-------------+---------------+--------------+
--| firstname | lastname | middlename | matchkey |
--+-------------+-------------+---------------+--------------+
--| Steven | Aaen | LYRIC | AAE |
--| DEBRA | AALMO | BOATMAN | AAE |
--| MARY | AARON | ROLLING MEADOW| AAE |
--+-------------+-------------+---------------+--------------+
--sample output data
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| firstname | lastname|middlename | matchkey|MatchRecordType|CollectionNumber|ExpressMatched|MatchScore|
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| Steven | Aaen | LYRIC | AAE | S | 0-0-1 | Y | 0 |
--| DEBRA | AALMO | BOATMAN | AAE | D | 0-0-1 | Y | 100 |
--| MARY | AARON | ROLLING MEA| AAE | D | 0-0-1 | Y | 100 |
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+