Secuencia de comandos de Hive de ejemplo

-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.

CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';

-- This rowid is needed by Transactional Match to maintain the order of rows while creating groups. This is a UDF (User Defined Function) and associates an incremental unique integer number to each row of the data.

CREATE TEMPORARY FUNCTION transactionalMatch as 'com.pb.bdq.amm.process.hive.transactional.TransactionalMatchUDAF';

-- Transactional Match is implemented as a UDAF (User Defined Aggregation function). It processes one group of rows at a time and generates the result for that group of rows.

-- Disable map side aggregation 
set hive.map.aggr = false;

-- Set the rule using configuration property 'pb.bdq.match.rule'
set pb.bdq.match.rule={"type":"Parent", "children":[{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0, "algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0, "algorithms":[{"name":"KeyboardDistance","weight":0,"options":null},
{"name":"Metaphone3","weight":0,"options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"lastname", "candidateField":null}], "matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks", "name":"NameData", "matchWhenNotTrue":false, "threshold":100, "weight":0};

-- Set header(along with id field alias used in query) using configuration property 'pb.bdq.match.header'
set pb.bdq.match.header=name,firstname,lastname,matchkey,middlename,recordid,id;

-- Set sort field name to the alias used in the query, using the configuration property 'pb.bdq.match.sort.field'
set pb.bdq.match.sort.field=id;

-- Set sort collection number option for unique records using configuration property 'pb.bdq.match.unique.candidate.return'. The default value is false.
set pb.bdq.match.unique.candidate.return=true;

-- Execute Query on the desired table. The query uses a UDF rowid, which must be present in the query to maintain the ordering of the data while reading.
-- Transactional Match returns a list of map containing <key=value> pairs. Each map in the list corresponds to a row in the group. The below query explodes that list of map and fetches fields from map by keys.

SELECT tmp2.record["MatchRecordType"],
	tmp2.record["MatchScore"],
	tmp2.record["HasDuplicate"],
	tmp2.record["name"],
	tmp2.record["firstname"],
	tmp2.record["lastname"],
	tmp2.record["matchkey"],
	tmp2.record["middlename"],
	tmp2.record["recordid"]
FROM (
	SELECT  transactionalMatch(innerRowID.name, innerRowID.firstname, innerRowID.lastname, innerRowID.matchkey, innerRowID.middlename, innerRowID.recordid, innerRowID.id
	) AS matchgroup  
	FROM (
		SELECT name, firstname, lastname, matchkey, middlename, recordid, rowid(name, firstname, lastname, matchkey, middlename, recordid) AS id FROM customer_data
		) innerRowID 
	GROUP BY matchkey 
) As innerResult 
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;


-- Query to dump output to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/transmatch/' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT tmp2.record["MatchRecordType"],
	tmp2.record["MatchScore"],
	tmp2.record["HasDuplicate"],
	tmp2.record["name"],
	tmp2.record["firstname"],
	tmp2.record["lastname"],
	tmp2.record["matchkey"],
	tmp2.record["middlename"],
	tmp2.record["recordid"]
FROM (
	SELECT  transactionalMatch(innerRowID.name, 
			innerRowID.firstname, 
			innerRowID.lastname,
			innerRowID.matchkey,
			innerRowID.middlename,
			innerRowID.recordid,
			innerRowID.id) as matchgroup  
	FROM (
		SELECT name, firstname, lastname, matchkey, middlename, recordid, rowid(name, firstname, lastname, matchkey, middlename, recordid) AS id 
		FROM customer_data
		) innerRowID 
	GROUP BY matchkey ) As innerResult 
LATERAL VIEW explode(innerResult.matchgroup) tmp2 as record ;



--sample input data
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| name              | firstname              | lastname              | matchkey              | middlename              | recordid              |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+
--| ZORINA  ABDOOL    | ZORINA                 | ABDOOL                | Z                     |                         | 12                    |
--| ZULFIQAR  ALI     | ZULFIQAR               | ALI                   | Z                     |                         | 116                   |
--| ZACHARY  BENNETT  | ZACHARY                | BENNETT               | Z                     |                         | 515                   |
--| ZOHAR  BUERGER    | ZOHAR                  | BUERGER               | Z                     |                         | 889                   |
--+-------------------+------------------------+-----------------------+-----------------------+-------------------------+-----------------------+--+

--sample output data
--+----------------+---------------------+----------+------------+-----------+-----------------+------------+--------------+
--|name            |firstname | lastname | matchkey | middlename | recordid  | MatchRecordType | MatchScore | HasDuplicate |
--+----------------+----------+----------+----------+------------+-----------+-----------------+------------+--------------+
--|ZORINA  ABDOOL  |ZORINA    | ABDOOL   | Z        |            | 12        |		S		   |	0		|		Y 	   |
--|ZULFIQAR  ALI   |ZULFIQAR  | ALI      | Z        |            | 116       |      D          |	90		|		D	   |
--|ZACHARY  BENNETT|ZACHARY   | BENNETT  | Z        |            | 515       |      D          |	91		|		D	   |
--|ZOHAR  BUERGER  |ZOHAR     | BUERGER  | Z        |            | 889       |		D		   |	91		|		D	   |
--+----------------+----------+---------------------+------------------------+-----------------+------------+--------------+