サンプル Hive スクリプト

-- Register Advance Matching Module[AMM] Hive UDF jar
ADD JAR <Directory path>/amm.hive.${project.version}.jar;

-- Provide alias to UDF class (optional). String in quotes represent class names needed for this job to run.
CREATE TEMPORARY FUNCTION rowid as 'com.pb.bdq.hive.common.RowIDGeneratorUDF';

-- This rowid is needed by Intraflow Match to maintain the order of rows while creating groups. This is a UDF (User Defined Function) and associates an incremental unique integer number to each row of the data.

CREATE TEMPORARY FUNCTION intraMatch as 'com.pb.bdq.amm.process.hive.intraflow.IntraMatchUDAF';
-- Intra Flow is implemented as a UDAF (User Defined Aggregation function). It processes one group of rows at a time and generates the result for that group of rows

-- Disable map side aggregation 
set hive.map.aggr = false;

-- Set the rule using configuration property 'pb.bdq.match.rule'
set pb.bdq.match.rule={"type":"Parent",
"children":[{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0,
"algorithms":[{"name":"EditDistance", "weight":0, "options":null},
{"name":"Metaphone", "weight":0, "options":null}],
"scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"firstname", "candidateField":null},
{"type":"Child", "matchWhenNotTrue":false, "threshold":80.0, "weight":0, "algorithms":[{"name":"KeyboardDistance", "weight":0, "options":null},
{"name":"Metaphone3", "weight":0, "options":null}], "scoringMethod":"Maximum", "missingDataMethod":"IgnoreBlanks", "crossMatchField":[], "suspectField":"lastname", "candidateField":null}],
"matchingMethod":"AllTrue", "scoringMethod":"Average", "missingDataMethod":"IgnoreBlanks", "name":"NameData", "matchWhenNotTrue":false, "threshold":100,"weight":0};


-- Set header(along with id field alias used in query) using configuration property 'pb.bdq.match.header'
set pb.bdq.match.header=firstname,lastname,matchkey,middlename,id;

-- Set the express match column (optional)
set pb.bdq.match.express.column=matchkey;

-- Set sort field name to the alias used in the query, using the configuration property 'pb.bdq.match.sort.field'
set pb.bdq.match.sort.field=id;

-- Set sort collection number option for unique records using configuration property 'pb.bdq.match.unique.collectnumber.zero'
set pb.bdq.match.unique.collectnumber.zero=false;

-- Execute Query on the desired table. The query uses a UDF rowid, which must be present in the query to maintain the ordering of the data while reading.
-- Intra Match returns a list of map containing <key=value> pairs. Each map in the list corresponds to a row in the group. The below query explodes that list of map and fetches fields from map by keys.

SELECT innerresult.record["MatchRecordType"],
	innerresult.record["MatchScore"],
	innerresult.record["CollectionNumber"],
	innerresult.record["ExpressMatched"],
	innerresult.record["firstname"],
	innerresult.record["lastname"],
	innerresult.record["matchkey"],
	innerresult.record["middlename"]
	FROM (
		SELECT  intraMatch( 
				innerRowID.firstname, 
				innerRowID.lastname, 
				innerRowID.matchkey, 
				innerRowID.middlename, 
				innerRowID.id
		) AS matchgroup  
		FROM (
			SELECT  firstname, lastname, matchkey, middlename, rowid(*) 
			AS id 
			FROM customer_data
			) innerRowID 
		GROUP BY matchkey 
	) AS innerResult 
	LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;

-- Query to dump output to a file

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/IntraFlow/' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' collection items terminated by '||' map keys terminated by ':'
SELECT innerresult.record["MatchRecordType"],
	innerresult.record["MatchScore"],
	innerresult.record["CollectionNumber"],
	innerresult.record["ExpressMatched"],
	innerresult.record["firstname"],
	innerresult.record["lastname"],
	innerresult.record["matchkey"],
	innerresult.record["middlename"]
	FROM (
		SELECT  intraMatch(innerRowID.firstname, 
				innerRowID.lastname, 
				innerRowID.matchkey, 
				innerRowID.middlename, 
				innerRowID.id
		) AS matchgroup 
		FROM (
				SELECT  firstname, lastname, matchkey, middlename, rowid(*) 
				AS id 
				FROM customer_data
			) innerRowID 
		GROUP BY matchkey 
		) AS innerResult 
	LATERAL VIEW explode(innerResult.matchgroup) innerresult AS record ;


--sample input data
--+-------------+-------------+---------------+--------------+
--| firstname   | lastname    | middlename    | matchkey     |
--+-------------+-------------+---------------+--------------+
--| Steven      | Aaen        | LYRIC         | AAE          |
--| DEBRA       | AALMO       | BOATMAN       | AAE          |              
--| MARY        | AARON       | ROLLING MEADOW| AAE          |
--+-------------+-------------+---------------+--------------+

--sample output data
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| firstname | lastname|middlename  | matchkey|MatchRecordType|CollectionNumber|ExpressMatched|MatchScore| 
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+
--| Steven    | Aaen    | LYRIC      | AAE     |	S	       |         0-0-1  |     Y        |     0    |
--| DEBRA     | AALMO   | BOATMAN    | AAE     |    D 	       |         0-0-1  |     Y        |    100   | 
--| MARY      | AARON   | ROLLING MEA| AAE     |    D  	       |         0-0-1  |     Y        |    100   |
--+-----------+---------+------------+---------+---------------+----------------+--------------+----------+