forked from jblocher/sas_util
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompany_macro.sas
359 lines (300 loc) · 13.4 KB
/
company_macro.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
/* *************************************************************************/
/* CREATED BY: Joey Engelberg (UNC-Chapel Hill)
/* MODIFIED BY:
/* DATE CREATED: ??
/* PROG NAME: company_macro(rev).sas
/* Project:
/* This File: Matches two datasets of Company Names
/************************************************************************************/
****Company Name Matching Macro;
****The purpose of the following macro is to 'match' the company names from two different datasets
The macro works as follows:
1) Separates every company (from both datasets) into words
2) Eliminates common company words like CO, LLP, CORP, etc.
3) Joins together the remaining words after the common words have been filtered out
4) Compares each consecutive pair of letters from each company in dataset1 with each company in dataset2
to determine a score. The score represents the fraction of letter pairs that matched
5) Organizes the results to show the best match for each company;
%MACRO mfmatch(dataset1=,var1=,dataset2=,var2=,out_ds=); /*** dataset 1 and 2 are the two SAS datasets with the companies you wish to match***/;
/*** var 1 and 2 are the two variables within the datasets which have the name of the companies***/;
/** out_ds is the dataset with the vars and matches **/
Data Dummy1;
Set &dataset1 NOBS=nobs1; /***setting first dataset and finding number of observations***/;
CALL SYMPUT("OBS",nobs1); /***keeping the number of observations as a macrovariable called OBS ***/;
Attrib &var1 CompA length=$200.;
Comp1=UPCASE(&var1); /***making the company names all uppercase***/;
Comp1=TRANWRD(Comp1,' INTL',' INTERNATIONAL');
Comp1=TRANWRD(Comp1,' LTD',' LIMITED');
Comp1=TRANWRD(Comp1,'CL A','');
Comp1=TRANWRD(Comp1,'CL B','');
Comp1=TRANWRD(Comp1,'CL C','');
Comp1=TRANWRD(Comp1,'-ADR','');
Comp1=TRANWRD(Comp1,'-LP','');
Comp1=TRANWRD(Comp1,'-SP','');
Comp1=TRANWRD(Comp1,' ADR','');
Comp1=TRANWRD(Comp1,'-REDH','');
Comp1=TRANWRD(Comp1,'-SPON','');
Comp1=TRANWRD(Comp1,'-OLD','');
Comp1=TRANWRD(Comp1,' (NEW)','');
Comp1=TRANWRD(Comp1,'-REDH','');
Comp1=TRANWRD(Comp1,'/NEW','');
Comp1=TRANWRD(Comp1,' L P','');
Comp1=TRANWRD(Comp1,'/PRED','');
Comp1=TRANWRD(Comp1,'/OLD','');
Comp1=TRANWRD(Comp1,'/AL','');
Comp1=TRANWRD(Comp1,'/AK','');
Comp1=TRANWRD(Comp1,'/AZ','');
Comp1=TRANWRD(Comp1,'/AR','');
Comp1=TRANWRD(Comp1,'/CA','');
Comp1=TRANWRD(Comp1,'/CO','');
Comp1=TRANWRD(Comp1,'/CT','');
Comp1=TRANWRD(Comp1,'/DE','');
Comp1=TRANWRD(Comp1,'/DC','');
Comp1=TRANWRD(Comp1,'/FL','');
Comp1=TRANWRD(Comp1,'/FLA','');
Comp1=TRANWRD(Comp1,'/GA','');
Comp1=TRANWRD(Comp1,'/HI','');
Comp1=TRANWRD(Comp1,'/IA','');
Comp1=TRANWRD(Comp1,'/ID','');
Comp1=TRANWRD(Comp1,'/IL','');
Comp1=TRANWRD(Comp1,'/IN','');
Comp1=TRANWRD(Comp1,'/KS','');
Comp1=TRANWRD(Comp1,'/KY','');
Comp1=TRANWRD(Comp1,'/LA','');
Comp1=TRANWRD(Comp1,'/MA','');
Comp1=TRANWRD(Comp1,'/MD','');
Comp1=TRANWRD(Comp1,'/ME','');
Comp1=TRANWRD(Comp1,'/MI','');
Comp1=TRANWRD(Comp1,'/MN','');
Comp1=TRANWRD(Comp1,'/MO','');
Comp1=TRANWRD(Comp1,'/MS','');
Comp1=TRANWRD(Comp1,'/MT','');
Comp1=TRANWRD(Comp1,'/NC','');
Comp1=TRANWRD(Comp1,'/ND','');
Comp1=TRANWRD(Comp1,'/NE','');
Comp1=TRANWRD(Comp1,'/NH','');
Comp1=TRANWRD(Comp1,'/NJ','');
Comp1=TRANWRD(Comp1,'/NM','');
Comp1=TRANWRD(Comp1,'/NV','');
Comp1=TRANWRD(Comp1,'/NY','');
Comp1=TRANWRD(Comp1,'/OH','');
Comp1=TRANWRD(Comp1,'/OK','');
Comp1=TRANWRD(Comp1,'/OR','');
Comp1=TRANWRD(Comp1,'/PA','');
Comp1=TRANWRD(Comp1,'/RI','');
Comp1=TRANWRD(Comp1,'/SC','');
Comp1=TRANWRD(Comp1,'/SD','');
Comp1=TRANWRD(Comp1,'/TN','');
Comp1=TRANWRD(Comp1,'/TX','');
Comp1=TRANWRD(Comp1,'/UT','');
Comp1=TRANWRD(Comp1,'/VA','');
Comp1=TRANWRD(Comp1,'/VT','');
Comp1=TRANWRD(Comp1,'/WA','');
Comp1=TRANWRD(Comp1,'/WV','');
Comp1=TRANWRD(Comp1,'/WI','');
Comp1=TRANWRD(Comp1,'/WY','');
text1=scan(Comp1,1,' ,'); /***separating the company names into words (14 is the maximum number of words)***/;
text2=scan(Comp1,2,' ,');
text3=scan(Comp1,3,' ,');
text4=scan(Comp1,4,' ,');
text5=scan(Comp1,5,' ,');
text6=scan(Comp1,6,' ,');
text7=scan(Comp1,7,' ,');
text8=scan(Comp1,8,' ,');
text9=scan(Comp1,9,' ,');
text10=scan(Comp1,10,' ,');
text11=scan(Comp1,11,' ,');
text12=scan(Comp1,12,' ,');
text13=scan(Comp1,13,' ,');
text14=scan(Comp1,14,' ,'); /***separating the company names into words (14 is the maximum number of words)***/;
array text (14) text1 text2 text3 text4 text5 text6 text7
text8 text9 text10 text11 text12 text13 text14;
Do i=1 to 14;
If text(i)='INC.' THEN text(i)=''; /***getting rid of common words that add no value to the comparisons***/;
If text(i)='INC' THEN text(i)='';
If text(i)='CO.' THEN text(i)='';
If text(i)='CO' THEN text(i)='';
If text(i)='INC.' THEN text(i)='';
If text(i)='INC' THEN text(i)='';
If text(i)='CORP' THEN text(i)='';
If text(i)='CORP.' THEN text(i)='';
If text(i)='LTD' THEN text(i)='';
If text(i)='LTD.' THEN text(i)='';
If text(i)='LP' THEN text(i)='';
If text(i)='LP.' THEN text(i)='';
If text(i)='LLC' THEN text(i)='';
If text(i)='LLC.' THEN text(i)=''; /***getting rid of common words that add no value to the comparisons***/;
End;
CompA=compress(trim(text1)||trim(text2)||trim(text3)||trim(text4)||
trim(text5)||trim(text6)||trim(text7)||trim(text8)
||trim(text9) ||trim(text10)||trim(text11)||trim(text12)||
trim(text13)||trim(text14),' ,-/'); /***joining the remaining words (after the commom ones have been dropped), compressing them and elminating commas, spaces and dashes***/;
lenA=length(trim(CompA)); /***determining the length of each compressed company***/;
FirstLetterA=substr(CompA,1,1); /***determining the first letter of each company***/;
recno=_n_; /***assigning a number to each observation***/;
Drop i Comp1 text1 text2 text3 text4 text5 text6 text7 text8
text9 text10 text11 text12 text13 text14; /***dropping unncessary variables***/;
Data Dummy2;
Set &dataset2;
Attrib &var2 CompB length=$200.;
Comp2=UPCASE(&var2); /***making the company names all uppercase***/;
Comp2=TRANWRD(Comp2,' INTL',' INTERNATIONAL');
Comp2=TRANWRD(Comp2,' LTD',' LIMITED');
Comp2=TRANWRD(Comp2,'CL A','');
Comp2=TRANWRD(Comp2,'CL B','');
Comp2=TRANWRD(Comp2,'CL C','');
Comp2=TRANWRD(Comp2,'-ADR','');
Comp2=TRANWRD(Comp2,'-LP','');
Comp2=TRANWRD(Comp2,'-SP','');
Comp2=TRANWRD(Comp2,' ADR','');
Comp2=TRANWRD(Comp2,'-REDH','');
Comp2=TRANWRD(Comp2,'-SPON','');
Comp2=TRANWRD(Comp2,'-OLD','');
Comp2=TRANWRD(Comp2,' (NEW)','');
Comp2=TRANWRD(Comp2,'-REDH','');
Comp2=TRANWRD(Comp2,'/NEW','');
Comp2=TRANWRD(Comp2,' L P','');
Comp2=TRANWRD(Comp2,'/PRED','');
Comp2=TRANWRD(Comp2,'/OLD','');
Comp2=TRANWRD(Comp2,'/AL','');
Comp2=TRANWRD(Comp2,'/AK','');
Comp2=TRANWRD(Comp2,'/AZ','');
Comp2=TRANWRD(Comp2,'/AR','');
Comp2=TRANWRD(Comp2,'/CA','');
Comp2=TRANWRD(Comp2,'/CO','');
Comp2=TRANWRD(Comp2,'/CT','');
Comp2=TRANWRD(Comp2,'/DE','');
Comp2=TRANWRD(Comp2,'/DC','');
Comp2=TRANWRD(Comp2,'/FL','');
Comp2=TRANWRD(Comp2,'/FLA','');
Comp2=TRANWRD(Comp2,'/GA','');
Comp2=TRANWRD(Comp2,'/HI','');
Comp2=TRANWRD(Comp2,'/IA','');
Comp2=TRANWRD(Comp2,'/ID','');
Comp2=TRANWRD(Comp2,'/IL','');
Comp2=TRANWRD(Comp2,'/IN','');
Comp2=TRANWRD(Comp2,'/KS','');
Comp2=TRANWRD(Comp2,'/KY','');
Comp2=TRANWRD(Comp2,'/LA','');
Comp2=TRANWRD(Comp2,'/MA','');
Comp2=TRANWRD(Comp2,'/MD','');
Comp2=TRANWRD(Comp2,'/ME','');
Comp2=TRANWRD(Comp2,'/MI','');
Comp2=TRANWRD(Comp2,'/MN','');
Comp2=TRANWRD(Comp2,'/MO','');
Comp2=TRANWRD(Comp2,'/MS','');
Comp2=TRANWRD(Comp2,'/MT','');
Comp2=TRANWRD(Comp2,'/NC','');
Comp2=TRANWRD(Comp2,'/ND','');
Comp2=TRANWRD(Comp2,'/NE','');
Comp2=TRANWRD(Comp2,'/NH','');
Comp2=TRANWRD(Comp2,'/NJ','');
Comp2=TRANWRD(Comp2,'/NM','');
Comp2=TRANWRD(Comp2,'/NV','');
Comp2=TRANWRD(Comp2,'/NY','');
Comp2=TRANWRD(Comp2,'/OH','');
Comp2=TRANWRD(Comp2,'/OK','');
Comp2=TRANWRD(Comp2,'/OR','');
Comp2=TRANWRD(Comp2,'/PA','');
Comp2=TRANWRD(Comp2,'/RI','');
Comp2=TRANWRD(Comp2,'/SC','');
Comp2=TRANWRD(Comp2,'/SD','');
Comp2=TRANWRD(Comp2,'/TN','');
Comp2=TRANWRD(Comp2,'/TX','');
Comp2=TRANWRD(Comp2,'/UT','');
Comp2=TRANWRD(Comp2,'/VA','');
Comp2=TRANWRD(Comp2,'/VT','');
Comp2=TRANWRD(Comp2,'/WA','');
Comp2=TRANWRD(Comp2,'/WV','');
Comp2=TRANWRD(Comp2,'/WI','');
Comp2=TRANWRD(Comp2,'/WY','');
text1=scan(Comp2,1,' ,'); /***separating the company names into words (14 is the maximum number of words)***/;
text2=scan(Comp2,2,' ,');
text3=scan(Comp2,3,' ,');
text4=scan(Comp2,4,' ,');
text5=scan(Comp2,5,' ,');
text6=scan(Comp2,6,' ,');
text7=scan(Comp2,7,' ,');
text8=scan(Comp2,8,' ,');
text9=scan(Comp2,9,' ,');
text10=scan(Comp2,10,' ,');
text11=scan(Comp2,11,' ,');
text12=scan(Comp2,12,' ,');
text13=scan(Comp2,13,' ,');
text14=scan(Comp2,14,' ,'); /***separating the company names into words (14 is the maximum number of words)***/;
array text (14) text1 text2 text3 text4 text5 text6 text7 text8
text9 text10 text11 text12 text13 text14;
Do i=1 to 14;
If text(i)='INC.' THEN text(i)=''; /***getting rid of common words that add no value to the comparisons***/;
If text(i)='INC' THEN text(i)='';
If text(i)='CO.' THEN text(i)='';
If text(i)='CO' THEN text(i)='';
If text(i)='INC.' THEN text(i)='';
If text(i)='INC' THEN text(i)='';
If text(i)='CORP' THEN text(i)='';
If text(i)='CORP.' THEN text(i)='';
If text(i)='LTD' THEN text(i)='';
If text(i)='LTD.' THEN text(i)='';
If text(i)='LP' THEN text(i)='';
If text(i)='LP.' THEN text(i)='';
If text(i)='LLC' THEN text(i)='';
If text(i)='LLC.' THEN text(i)=''; /***getting rid of common words that add no value to the comparisons***/;
End;
CompB=compress(trim(text1)||trim(text2)||trim(text3)||trim(text4)||
trim(text5) ||trim(text6)||trim(text7)||trim(text8) ||trim(text9) ||
trim(text10) ||trim(text11)||trim(text12)||trim(text13)||trim(text14),' ,-'); /***joining the remaining words (after the commom ones have been dropped), compressing them and elminating commas, spaces and dashes***/;
lenB=length(trim(CompB)); /***determining the length of each compressed company***/;
FirstLetterB=substr(CompB,1,1); /***determining the first letter of each company***/;
Drop i Comp2 text1 text2 text3 text4 text5 text6 text7 text8
text9 text10 text11 text12 text13 text14; /***dropping unncessary variables***/;
Data &out_ds; /***creating any empty dataset called cumulative will eventually accumulate the final results of the matching procedure***/;
Set Dummy1;
If CompA='';
%DO b=1 %TO &OBS; /***this is the macro do statement which loops through each observation***/;
Data Dummy3;
Set Dummy1;
If recno=&b; /***this subsetting statment takes the b observation only (recall from above that b is the variable that runs through each observation of dataset1)***/;
DATA compare1;
SET Dummy3;
DO k=1 TO nobs2;
SET Dummy2 NOBS=nobs2 POINT=k; /***this command requests each observation one at a time from dataset2***/;
If trim(FirstLetterA)=trim(FirstLetterB) Then do; /***runs the analysis only for companies who begin with the same first letter***/;
Score1=0;
Score2=0;
Do m=1 to lenA; /***repeats the analysis for each pair of letters (every compressed word has lenA-1 pairs)***/;
If m<=lenA-1 Then do;
pair=substr(CompA,m,2); /***determining letter pair***/;
If index(CompB,trim(pair))^=0 then score1=score1+1; /***increasing score if letter pair matches within selected company in dataset2***/;
end;
end;
/***repeats the analysis except now pairs from the selected company in dataset2 are compared with the selected company in dataset2***/;
Do n=1 to lenB;
If n<=lenB-1 Then do;
pair=substr(CompB,n,2);
If index(CompA,trim(pair))^=0 then score2=score2+1;
end;
end;
MatchPercent1=score1/(lenA-1); /***calculates a percent of pairs (from the selected company in dataset1) that matched***/;
MatchPercent2=score2/(lenB-1); /***calculates a percent of pairs (from the selected company in dataset2) that matched***/;
TotalScore=MatchPercent1*MatchPercent2; /***calculates a total score by multiplying the two percents together; clearly a total score of 1 is a perfect match. A score of .7 and higher is a good match.***/;
If TotalScore>.3 THEN OUTPUT; /***outputs only those matches that have a TotalScore greater than .3 - typically matches less than .3 are bad matches***/;
end;
END;
Stop;
DROP pair m n; /***drop unneeded variables***/;
PROC Sort Data=compare1 Out=compare1;
By &var1 &var2;
Data compare3;
set compare1;
By &var1 &var2;
PROC SORT Data=Compare3 Out=Compare3;
By &var1 TotalScore;
Data compare4;
Set compare3;
By &var1;
If Last.&var1=1; /***taking only best match***/;
Data &out_ds;
Set &out_ds Compare4; /***accumulates the best matches in the cumulative dataset***/;
drop recno score1 score2 lenA lenB MatchPercent1 MatchPercent2;
%End;
%MEND mfmatch;