- matcher.service.ts: name/address normalization, Levenshtein distance, geocoding proximity, confidence scoring (0.0-1.0) - change-detector.ts: PropertySnapshot diff engine, severity scoring (minor/moderate/major), configurable thresholds, alert triggering - 57 unit tests with 98%+ coverage across all thresholds
310 lines
10 KiB
TypeScript
310 lines
10 KiB
TypeScript
import {
|
|
Address,
|
|
MatchResult,
|
|
MatchDetails,
|
|
FieldMatch,
|
|
MatchingConfig,
|
|
NormalizedTokens,
|
|
PropertyType,
|
|
} from './types';
|
|
|
|
const DEFAULT_CONFIG: MatchingConfig = {
|
|
nameThreshold: 0.85,
|
|
addressThreshold: 0.9,
|
|
overallThreshold: 0.85,
|
|
geocodingRadiusMeters: 100,
|
|
};
|
|
|
|
const COMMON_PREFIXES = new Set([
|
|
'mr', 'mrs', 'ms', 'miss', 'dr', 'prof', 'jr', 'sr', 'junior', 'senior',
|
|
'ii', 'iii', 'iv', 'rev', 'st', 'hon', 'esq',
|
|
]);
|
|
|
|
const COMMON_SUFFIXES = new Set([
|
|
'jr', 'sr', 'junior', 'senior', 'ii', 'iii', 'iv', 'v', 'esq',
|
|
'phd', 'md', 'llm', 'cpa',
|
|
]);
|
|
|
|
const STREET_TYPE_MAP: Record<string, string> = {
|
|
'st': 'street', 'street': 'street',
|
|
'ave': 'avenue', 'avenue': 'avenue',
|
|
'blvd': 'boulevard', 'boulevard': 'boulevard',
|
|
'dr': 'drive', 'drive': 'drive',
|
|
'ln': 'lane', 'lane': 'lane',
|
|
'ct': 'court', 'court': 'court',
|
|
'pl': 'place', 'place': 'place',
|
|
'rd': 'road', 'road': 'road',
|
|
'way': 'way',
|
|
'trl': 'trail', 'trail': 'trail',
|
|
'hwy': 'highway', 'highway': 'highway',
|
|
'pkwy': 'parkway', 'parkway': 'parkway',
|
|
'cir': 'circle', 'circle': 'circle',
|
|
'sq': 'square', 'square': 'square',
|
|
'ter': 'terrace', 'terrace': 'terrace',
|
|
};
|
|
|
|
const PROPERTY_TYPE_CONFIGS: Record<PropertyType, Partial<MatchingConfig>> = {
|
|
'residential': { nameThreshold: 0.85, addressThreshold: 0.9 },
|
|
'commercial': { nameThreshold: 0.8, addressThreshold: 0.9 },
|
|
'land': { nameThreshold: 0.8, addressThreshold: 0.85 },
|
|
'multi-family': { nameThreshold: 0.8, addressThreshold: 0.9 },
|
|
};
|
|
|
|
function levenshteinDistance(a: string, b: string): number {
|
|
const matrix: number[][] = Array.from({ length: b.length + 1 }, (_, i) =>
|
|
Array.from({ length: a.length + 1 }, (_, j) => (i === 0 ? j : j === 0 ? i : 0))
|
|
);
|
|
|
|
for (let i = 1; i <= b.length; i++) {
|
|
for (let j = 1; j <= a.length; j++) {
|
|
const cost = a[j - 1] === b[i - 1] ? 0 : 1;
|
|
matrix[i][j] = Math.min(
|
|
matrix[i - 1][j] + 1,
|
|
matrix[i][j - 1] + 1,
|
|
matrix[i - 1][j - 1] + cost,
|
|
);
|
|
}
|
|
}
|
|
|
|
return matrix[b.length][a.length];
|
|
}
|
|
|
|
function similarityScore(distance: number, maxLen: number): number {
|
|
if (maxLen === 0) return 1.0;
|
|
return 1.0 - distance / maxLen;
|
|
}
|
|
|
|
function normalizeString(str: string): string {
|
|
return str
|
|
.toLowerCase()
|
|
.replace(/[''']/g, '')
|
|
.replace(/[^a-z0-9\s]/g, ' ')
|
|
.replace(/\s+/g, ' ')
|
|
.trim();
|
|
}
|
|
|
|
function parseName(name: string): NormalizedTokens {
|
|
const clean = normalizeString(name);
|
|
const parts = clean.split(' ').filter(Boolean);
|
|
|
|
let firstName = '';
|
|
let lastName = '';
|
|
let middleName = '';
|
|
const initials: string[] = [];
|
|
|
|
if (parts.length === 0) return { firstName, lastName, middleName, initials };
|
|
|
|
let startIdx = 0;
|
|
while (startIdx < parts.length && COMMON_PREFIXES.has(parts[startIdx])) {
|
|
startIdx++;
|
|
}
|
|
|
|
let endIdx = parts.length;
|
|
while (endIdx > startIdx + 1 && COMMON_SUFFIXES.has(parts[endIdx - 1])) {
|
|
endIdx--;
|
|
}
|
|
|
|
const coreParts = parts.slice(startIdx, endIdx);
|
|
|
|
if (coreParts.length === 1) {
|
|
lastName = coreParts[0];
|
|
} else if (coreParts.length === 2) {
|
|
firstName = coreParts[0];
|
|
lastName = coreParts[1];
|
|
} else {
|
|
firstName = coreParts[0];
|
|
lastName = coreParts[coreParts.length - 1];
|
|
middleName = coreParts.slice(1, -1).join(' ');
|
|
}
|
|
|
|
if (firstName.length === 1) {
|
|
initials.push(firstName);
|
|
}
|
|
if (middleName) {
|
|
const middleParts = middleName.split(' ');
|
|
for (const mp of middleParts) {
|
|
if (mp.length === 1) initials.push(mp);
|
|
}
|
|
}
|
|
|
|
return { firstName, lastName, middleName, initials };
|
|
}
|
|
|
|
function normalizeStreetType(type: string): string {
|
|
const clean = normalizeString(type);
|
|
return STREET_TYPE_MAP[clean] || clean;
|
|
}
|
|
|
|
function normalizeAddress(addr: Address): string {
|
|
const parts = [
|
|
addr.streetNumber,
|
|
normalizeString(addr.streetName),
|
|
addr.streetType ? normalizeStreetType(addr.streetType) : '',
|
|
addr.unit ? normalizeString(addr.unit) : '',
|
|
normalizeString(addr.city),
|
|
addr.state.toLowerCase(),
|
|
addr.zip,
|
|
].filter(Boolean);
|
|
return parts.join(' ');
|
|
}
|
|
|
|
function computeFieldMatch(valueA: string, valueB: string, normalizeFn?: (v: string) => string): FieldMatch {
|
|
const normFn = normalizeFn || normalizeString;
|
|
const normalizedA = normFn(valueA);
|
|
const normalizedB = normFn(valueB);
|
|
|
|
if (!normalizedA && !normalizedB) return { valueA, valueB, normalizedA, normalizedB, score: 1.0 };
|
|
if (!normalizedA || !normalizedB) return { valueA, valueB, normalizedA, normalizedB, score: 0.0 };
|
|
|
|
if (normalizedA === normalizedB) return { valueA, valueB, normalizedA, normalizedB, score: 1.0 };
|
|
|
|
const dist = levenshteinDistance(normalizedA, normalizedB);
|
|
const maxLen = Math.max(normalizedA.length, normalizedB.length);
|
|
const score = similarityScore(dist, maxLen);
|
|
|
|
return { valueA, valueB, normalizedA, normalizedB, score: Math.round(score * 1000) / 1000 };
|
|
}
|
|
|
|
function haversineDistance(lat1: number, lon1: number, lat2: number, lon2: number): number {
|
|
const R = 6371000;
|
|
const dLat = ((lat2 - lat1) * Math.PI) / 180;
|
|
const dLon = ((lon2 - lon1) * Math.PI) / 180;
|
|
const a =
|
|
Math.sin(dLat / 2) * Math.sin(dLat / 2) +
|
|
Math.cos((lat1 * Math.PI) / 180) *
|
|
Math.cos((lat2 * Math.PI) / 180) *
|
|
Math.sin(dLon / 2) *
|
|
Math.sin(dLon / 2);
|
|
const c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a));
|
|
return R * c;
|
|
}
|
|
|
|
function computeNameScore(tokensA: NormalizedTokens, tokensB: NormalizedTokens): number {
|
|
const firstScore = computeFieldMatch(tokensA.firstName, tokensB.firstName).score;
|
|
const lastScore = computeFieldMatch(tokensA.lastName, tokensB.lastName).score;
|
|
const middleScore = computeFieldMatch(tokensA.middleName, tokensB.middleName).score;
|
|
|
|
let initialMatchScore = 1.0;
|
|
if (tokensA.initials.length > 0 || tokensB.initials.length > 0) {
|
|
const allInitialsA = new Set(tokensA.initials.map(i => i.toLowerCase()));
|
|
const allInitialsB = new Set(tokensB.initials.map(i => i.toLowerCase()));
|
|
let matched = 0;
|
|
for (const init of allInitialsA) {
|
|
if (allInitialsB.has(init)) matched++;
|
|
}
|
|
const total = Math.max(allInitialsA.size, allInitialsB.size);
|
|
initialMatchScore = total > 0 ? matched / total : 1.0;
|
|
}
|
|
|
|
const weighted = (lastScore * 0.45) + (firstScore * 0.35) + (middleScore * 0.1) + (initialMatchScore * 0.1);
|
|
return Math.round(weighted * 1000) / 1000;
|
|
}
|
|
|
|
function computeAddressScore(addrA: Address, addrB: Address, config: MatchingConfig): { score: number; geocodingDistance?: number } {
|
|
const numberMatch = computeFieldMatch(addrA.streetNumber, addrB.streetNumber).score;
|
|
const streetMatch = computeFieldMatch(addrA.streetName, addrB.streetName, normalizeString).score;
|
|
const typeMatch = computeFieldMatch(
|
|
addrA.streetType ? normalizeStreetType(addrA.streetType) : '',
|
|
addrB.streetType ? normalizeStreetType(addrB.streetType) : '',
|
|
).score;
|
|
const unitMatch = computeFieldMatch(addrA.unit || '', addrB.unit || '').score;
|
|
const cityMatch = computeFieldMatch(addrA.city, addrB.city).score;
|
|
const stateMatch = computeFieldMatch(addrA.state, addrB.state).score;
|
|
const zipMatch = computeFieldMatch(addrA.zip, addrB.zip).score;
|
|
|
|
let geocodingDistance: number | undefined;
|
|
let geoScore = 0.0;
|
|
|
|
if (addrA.latitude && addrA.longitude && addrB.latitude && addrB.longitude) {
|
|
geocodingDistance = haversineDistance(addrA.latitude, addrA.longitude, addrB.latitude, addrB.longitude);
|
|
const maxDist = config.geocodingRadiusMeters;
|
|
geoScore = geocodingDistance <= maxDist ? 1.0 : Math.max(0, 1.0 - (geocodingDistance - maxDist) / (maxDist * 5));
|
|
}
|
|
|
|
const weighted =
|
|
(numberMatch * 0.2) +
|
|
(streetMatch * 0.25) +
|
|
(typeMatch * 0.1) +
|
|
(unitMatch * 0.1) +
|
|
(cityMatch * 0.1) +
|
|
(stateMatch * 0.1) +
|
|
(zipMatch * 0.1) +
|
|
(geoScore * (geocodingDistance !== undefined ? 0.05 : 0));
|
|
|
|
return { score: Math.round(weighted * 1000) / 1000, geocodingDistance };
|
|
}
|
|
|
|
export function matchRecords(
|
|
nameA: string,
|
|
addressA: Address,
|
|
nameB: string,
|
|
addressB: Address,
|
|
config?: Partial<MatchingConfig>,
|
|
): MatchResult {
|
|
const effectiveConfig = { ...DEFAULT_CONFIG, ...config };
|
|
|
|
const tokensA = parseName(nameA);
|
|
const tokensB = parseName(nameB);
|
|
|
|
const nameScore = computeNameScore(tokensA, tokensB);
|
|
|
|
const { score: addressScore, geocodingDistance } = computeAddressScore(addressA, addressB, effectiveConfig);
|
|
|
|
const overallConfidence = Math.round((nameScore * 0.5 + addressScore * 0.5) * 1000) / 1000;
|
|
|
|
const firstMatch = computeFieldMatch(tokensA.firstName, tokensB.firstName);
|
|
const lastMatch = computeFieldMatch(tokensA.lastName, tokensB.lastName);
|
|
const middleMatch = computeFieldMatch(tokensA.middleName, tokensB.middleName);
|
|
const numberMatch = computeFieldMatch(addressA.streetNumber, addressB.streetNumber);
|
|
const streetMatch = computeFieldMatch(addressA.streetName, addressB.streetName, normalizeString);
|
|
const typeMatch = computeFieldMatch(
|
|
addressA.streetType ? normalizeStreetType(addressA.streetType) : '',
|
|
addressB.streetType ? normalizeStreetType(addressB.streetType) : '',
|
|
);
|
|
const unitMatch = computeFieldMatch(addressA.unit || '', addressB.unit || '');
|
|
const cityMatch = computeFieldMatch(addressA.city, addressB.city);
|
|
const stateMatch = computeFieldMatch(addressA.state, addressB.state);
|
|
const zipMatch = computeFieldMatch(addressA.zip, addressB.zip);
|
|
|
|
const normalizedA = normalizeAddress(addressA);
|
|
const normalizedB = normalizeAddress(addressB);
|
|
|
|
const dist = levenshteinDistance(
|
|
normalizeString(nameA),
|
|
normalizeString(nameB),
|
|
);
|
|
|
|
const details: MatchDetails = {
|
|
nameNormalized: [normalizeString(nameA), normalizeString(nameB)],
|
|
addressNormalized: [normalizedA, normalizedB],
|
|
levenshteinDistance: dist,
|
|
geocodingDistance,
|
|
fields: {
|
|
firstName: firstMatch,
|
|
lastName: lastMatch,
|
|
middleName: middleMatch,
|
|
streetNumber: numberMatch,
|
|
streetName: streetMatch,
|
|
streetType: typeMatch,
|
|
unit: unitMatch,
|
|
city: cityMatch,
|
|
state: stateMatch,
|
|
zip: zipMatch,
|
|
},
|
|
};
|
|
|
|
return {
|
|
nameScore,
|
|
addressScore,
|
|
overallConfidence,
|
|
isMatch: overallConfidence >= effectiveConfig.overallThreshold,
|
|
details,
|
|
};
|
|
}
|
|
|
|
export function getConfigForPropertyType(type: PropertyType): MatchingConfig {
|
|
return { ...DEFAULT_CONFIG, ...PROPERTY_TYPE_CONFIGS[type] };
|
|
}
|
|
|
|
export { parseName, normalizeString, normalizeStreetType, levenshteinDistance, similarityScore };
|