Soundex for faster search

Summary

Soundex is a phonetic algorithm that is used to index and search for words based on their pronunciation rather than their spelling. The algorithm assigns a code to each word based on the sound of its initial consonants. This code can then be used to find other words that have a similar sound, even if they are spelled differently.

The Soundex algorithm works by following a series of steps to convert each word into a standardized code. First, the word is converted to uppercase and all non-alphabetic characters are removed. Then, duplicate adjacent letters (except for “C”) are removed and each remaining letter is converted to a digit using a set of mapping rules. Finally, the resulting code is padded with zeros and truncated to a length of four characters. This code can be used to compare words and find matches based on their pronunciation, which can be useful in applications such as name matching, genealogy research, and database searching.

C# Code Sample

using System;

class Soundex
{
    static void Main()
    {
        string name = "Robert";
        string[] words = {"Rupert", "Smith", "Johnson", "Roberts", "Robinson"};

        Console.WriteLine("Matching words for {0}:", name);

        string soundexCode = GetSoundex(name);

        foreach (string word in words)
        {
            if (GetSoundex(word) == soundexCode)
            {
                Console.WriteLine(word);
            }
        }

        Console.ReadLine();
    }

    static string GetSoundex(string input)
    {
        // Step 1: Convert the input string to all uppercase
        input = input.ToUpper();

        // Step 2: Remove all non-alphabetic characters
        input = System.Text.RegularExpressions.Regex.Replace(input, "[^A-Z]", "");

        // Step 3: Remove duplicate adjacent letters (except for C, which is treated differently in step 4)
        char[] inputChars = input.ToCharArray();
        string output = "" + inputChars[0];
        for (int i = 1; i < inputChars.Length; i++)
        {
            if (inputChars[i] != inputChars[i - 1])
            {
                output += inputChars[i];
            }
        }

        // Step 4: Convert remaining letters to digits using the Soundex mapping
        output = output.Replace("BFPV", "1");
        output = output.Replace("CGJKQSXZ", "2");
        output = output.Replace("DT", "3");
        output = output.Replace("L", "4");
        output = output.Replace("MN", "5");
        output = output.Replace("R", "6");
        output = output.Replace("C", ""); // Remove C (see step 3)

        // Step 5: If the resulting string is less than four characters long, pad it with zeros
        if (output.Length < 4)
        {
            output = output.PadRight(4, '0');
        }

        // Step 6: Return the first four characters
        return output.Substring(0, 4);
    }
}

C implementation for IoT Devices

#include <stdio.h>
#include <ctype.h>
#include <string.h>

void get_soundex(char *input, char *output);

int main()
{
    char name[] = "Robert";
    char *words[] = {"Rupert", "Smith", "Johnson", "Roberts", "Robinson"};
    int num_words = sizeof(words) / sizeof(char *);
    char soundex_code[5];

    printf("Matching words for %s:\n", name);
    get_soundex(name, soundex_code);

    for (int i = 0; i < num_words; i++)
    {
        char word_soundex[5];
        get_soundex(words[i], word_soundex);

        if (strcmp(soundex_code, word_soundex) == 0)
        {
            printf("%s\n", words[i]);
        }
    }

    return 0;
}

void get_soundex(char *input, char *output)
{
    // Step 1: Convert the input string to all uppercase
    for (int i = 0; i < strlen(input); i++)
    {
        input[i] = toupper(input[i]);
    }

    // Step 2: Remove all non-alphabetic characters
    char cleaned_input[strlen(input)];
    int j = 0;
    for (int i = 0; i < strlen(input); i++)
    {
        if (isalpha(input[i]))
        {
            cleaned_input[j++] = input[i];
        }
    }
    cleaned_input[j] = '\0';

    // Step 3: Remove duplicate adjacent letters (except for C, which is treated differently in step 4)
    char output_buffer[strlen(cleaned_input)];
    int k = 0;
    output_buffer[k++] = cleaned_input[0];
    for (int i = 1; i < strlen(cleaned_input); i++)
    {
        if (cleaned_input[i] != cleaned_input[i - 1])
        {
            if (cleaned_input[i] != 'H' && cleaned_input[i] != 'W')
            {
                output_buffer[k++] = cleaned_input[i];
            }
        }
    }
    output_buffer[k] = '\0';

    // Step 4: Convert remaining letters to digits using the Soundex mapping
    for (int i = 0; i < strlen(output_buffer); i++)
    {
        switch (output_buffer[i])
        {
            case 'B':
            case 'F':
            case 'P':
            case 'V':
                output[i] = '1';
                break;
            case 'C':
            case 'G':
            case 'J':
            case 'K':
            case 'Q':
            case 'S':
            case 'X':
            case 'Z':
                output[i] = '2';
                break;
            case 'D':
            case 'T':
                output[i] = '3';
                break;
            case 'L':
                output[i] = '4';
                break;
            case 'M':
            case 'N':
                output[i] = '5';
                break;
            case 'R':
                output[i] = '6';
                break;
            default:
                output[i] = '0';
                break;
        }
    }

    // Step 5: If the resulting string is less than four characters long, pad it with zeros
    int output_length = strlen(output_buffer);
    if (output_length < 4)
    {
        for (int i = output_length; i < 4; i++)
        {
            output[i] = '0';
        }
        output[4] = '\0