Soundex for faster search
Summary
Soundex is a phonetic algorithm that is used to index and search for words based on their pronunciation rather than their spelling. The algorithm assigns a code to each word based on the sound of its initial consonants. This code can then be used to find other words that have a similar sound, even if they are spelled differently.
The Soundex algorithm works by following a series of steps to convert each word into a standardized code. First, the word is converted to uppercase and all non-alphabetic characters are removed. Then, duplicate adjacent letters (except for “C”) are removed and each remaining letter is converted to a digit using a set of mapping rules. Finally, the resulting code is padded with zeros and truncated to a length of four characters. This code can be used to compare words and find matches based on their pronunciation, which can be useful in applications such as name matching, genealogy research, and database searching.
C# Code Sample
using System;
class Soundex
{
static void Main()
{
string name = "Robert";
string[] words = {"Rupert", "Smith", "Johnson", "Roberts", "Robinson"};
Console.WriteLine("Matching words for {0}:", name);
string soundexCode = GetSoundex(name);
foreach (string word in words)
{
if (GetSoundex(word) == soundexCode)
{
Console.WriteLine(word);
}
}
Console.ReadLine();
}
static string GetSoundex(string input)
{
// Step 1: Convert the input string to all uppercase
input = input.ToUpper();
// Step 2: Remove all non-alphabetic characters
input = System.Text.RegularExpressions.Regex.Replace(input, "[^A-Z]", "");
// Step 3: Remove duplicate adjacent letters (except for C, which is treated differently in step 4)
char[] inputChars = input.ToCharArray();
string output = "" + inputChars[0];
for (int i = 1; i < inputChars.Length; i++)
{
if (inputChars[i] != inputChars[i - 1])
{
output += inputChars[i];
}
}
// Step 4: Convert remaining letters to digits using the Soundex mapping
output = output.Replace("BFPV", "1");
output = output.Replace("CGJKQSXZ", "2");
output = output.Replace("DT", "3");
output = output.Replace("L", "4");
output = output.Replace("MN", "5");
output = output.Replace("R", "6");
output = output.Replace("C", ""); // Remove C (see step 3)
// Step 5: If the resulting string is less than four characters long, pad it with zeros
if (output.Length < 4)
{
output = output.PadRight(4, '0');
}
// Step 6: Return the first four characters
return output.Substring(0, 4);
}
}
C implementation for IoT Devices
#include <stdio.h>
#include <ctype.h>
#include <string.h>
void get_soundex(char *input, char *output);
int main()
{
char name[] = "Robert";
char *words[] = {"Rupert", "Smith", "Johnson", "Roberts", "Robinson"};
int num_words = sizeof(words) / sizeof(char *);
char soundex_code[5];
printf("Matching words for %s:\n", name);
get_soundex(name, soundex_code);
for (int i = 0; i < num_words; i++)
{
char word_soundex[5];
get_soundex(words[i], word_soundex);
if (strcmp(soundex_code, word_soundex) == 0)
{
printf("%s\n", words[i]);
}
}
return 0;
}
void get_soundex(char *input, char *output)
{
// Step 1: Convert the input string to all uppercase
for (int i = 0; i < strlen(input); i++)
{
input[i] = toupper(input[i]);
}
// Step 2: Remove all non-alphabetic characters
char cleaned_input[strlen(input)];
int j = 0;
for (int i = 0; i < strlen(input); i++)
{
if (isalpha(input[i]))
{
cleaned_input[j++] = input[i];
}
}
cleaned_input[j] = '\0';
// Step 3: Remove duplicate adjacent letters (except for C, which is treated differently in step 4)
char output_buffer[strlen(cleaned_input)];
int k = 0;
output_buffer[k++] = cleaned_input[0];
for (int i = 1; i < strlen(cleaned_input); i++)
{
if (cleaned_input[i] != cleaned_input[i - 1])
{
if (cleaned_input[i] != 'H' && cleaned_input[i] != 'W')
{
output_buffer[k++] = cleaned_input[i];
}
}
}
output_buffer[k] = '\0';
// Step 4: Convert remaining letters to digits using the Soundex mapping
for (int i = 0; i < strlen(output_buffer); i++)
{
switch (output_buffer[i])
{
case 'B':
case 'F':
case 'P':
case 'V':
output[i] = '1';
break;
case 'C':
case 'G':
case 'J':
case 'K':
case 'Q':
case 'S':
case 'X':
case 'Z':
output[i] = '2';
break;
case 'D':
case 'T':
output[i] = '3';
break;
case 'L':
output[i] = '4';
break;
case 'M':
case 'N':
output[i] = '5';
break;
case 'R':
output[i] = '6';
break;
default:
output[i] = '0';
break;
}
}
// Step 5: If the resulting string is less than four characters long, pad it with zeros
int output_length = strlen(output_buffer);
if (output_length < 4)
{
for (int i = output_length; i < 4; i++)
{
output[i] = '0';
}
output[4] = '\0