Skip to content

Commit fe4b357

Browse files
authored
Add Jaccard Similarity (TheAlgorithms#419)
1 parent 0677638 commit fe4b357

File tree

5 files changed

+220
-0
lines changed

5 files changed

+220
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
using System;
2+
using Algorithms.Strings.Similarity;
3+
using FluentAssertions;
4+
using NUnit.Framework;
5+
6+
namespace Algorithms.Tests.Strings.Similarity;
7+
8+
/// <summary>
///     Unit tests for <see cref="JaccardDistance" />.
/// </summary>
public class JaccardDistanceTests
{
    // Absolute tolerance applied when comparing the computed distance with the expected value.
    private const double Tolerance = 0.0001;

    private readonly JaccardDistance sut = new();

    /// <summary>
    ///     A null argument on either side (or both) must be rejected with an <see cref="ArgumentNullException" />.
    /// </summary>
    [TestCase("left", null)]
    [TestCase(null, "right")]
    [TestCase(null, null)]
    public void Calculate_WhenStringsAreNull_ThrowsArgumentNullException(string left, string right)
    {
        Action calculate = () => sut.Calculate(left, right);

        calculate.Should().Throw<ArgumentNullException>();
    }

    /// <summary>
    ///     Checks the computed distance against known values, including the empty-string edge cases.
    /// </summary>
    [TestCase("", "", 0.0d)]
    [TestCase("left", "", 1.0d)]
    [TestCase("", "right", 1.0d)]
    [TestCase("frog", "fog", 0.25d)]
    [TestCase("fly", "ant", 1.0d)]
    [TestCase("elephant", "hippo", 0.777777d)]
    [TestCase("ABC Corporation", "ABC Corp", 0.36363d)]
    public void Calculate_WhenProvidedWithStrings_CalculatesCorrectDistance(string left, string right, double expected)
        => sut.Calculate(left, right).Should().BeApproximately(expected, Tolerance);
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
using System;
2+
using Algorithms.Strings.Similarity;
3+
using FluentAssertions;
4+
using NUnit.Framework;
5+
6+
namespace Algorithms.Tests.Strings.Similarity;
7+
8+
/// <summary>
///     Unit tests for <see cref="JaccardSimilarity" />.
/// </summary>
public class JaccardSimilarityTests
{
    // Absolute tolerance applied when comparing the computed coefficient with the expected value.
    private const double Tolerance = 0.0001;

    private readonly JaccardSimilarity sut = new();

    /// <summary>
    ///     A null argument on either side (or both) must be rejected with an <see cref="ArgumentNullException" />.
    /// </summary>
    [TestCase("left", null)]
    [TestCase(null, "right")]
    [TestCase(null, null)]
    public void Calculate_WhenStringsAreNull_ThrowsArgumentNullException(string left, string right)
    {
        Action calculate = () => sut.Calculate(left, right);

        calculate.Should().Throw<ArgumentNullException>();
    }

    /// <summary>
    ///     Checks the computed similarity coefficient against known values, including the empty-string edge cases.
    /// </summary>
    // NOTE: despite the "Distance" suffix in its name, this test verifies the similarity coefficient.
    [TestCase("", "", 1.0d)]
    [TestCase("left", "", 0.0d)]
    [TestCase("", "right", 0.0d)]
    [TestCase("frog", "fog", 0.75d)]
    [TestCase("fly", "ant", 0.0d)]
    [TestCase("elephant", "hippo", 0.22222d)]
    [TestCase("ABC Corporation", "ABC Corp", 0.636363d)]
    public void Calculate_WhenProvidedWithStrings_CalculatesTheCorrectDistance(string left, string right, double expected)
        => sut.Calculate(left, right).Should().BeApproximately(expected, Tolerance);
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
namespace Algorithms.Strings.Similarity;
2+
3+
/// <summary>
///     <para>
///         Jaccard distance is a measure of how dissimilar two sets of data are. It is obtained by subtracting the
///         Jaccard similarity coefficient from 1 or, equivalently, by dividing the difference between the sizes of
///         the union and the intersection of the two sets by the size of the union.
///     </para>
///     <para>
///         For example, suppose we have two sets of words:
///         <list type="bullet">
///             <item>
///                 A = {apple, banana, cherry, date}
///             </item>
///             <item>
///                 B = {banana, cherry, elderberry, fig}
///             </item>
///         </list>
///     </para>
///     <para>
///         The sets have 2 elements in common (banana and cherry), and 6 elements appear in at least one of them
///         (apple, banana, cherry, date, elderberry, fig).
///     </para>
///     <para>
///         The Jaccard similarity coefficient is 2 / 6 = 0.333333, or 33.333% similarity.
///     </para>
///     <para>
///         The Jaccard distance is 1 - 0.33333 = 0.66667, meaning the two sets are about 67% different.
///     </para>
///     <para>
///         Jaccard distance is commonly used to compute distance matrices for clustering and multidimensional
///         scaling of sample sets.
///     </para>
/// </summary>
public class JaccardDistance
{
    // The distance is defined in terms of the similarity coefficient, so the computation is delegated.
    private readonly JaccardSimilarity similarityCalculator = new();

    /// <summary>
    ///     Calculates the Jaccard distance between two strings.
    /// </summary>
    /// <param name="left">The first string.</param>
    /// <param name="right">The second string.</param>
    /// <returns>The Jaccard distance, i.e. one minus the Jaccard similarity coefficient of the two strings.</returns>
    /// <exception cref="System.ArgumentNullException">
    ///     Propagated from the similarity calculation when either input is null.
    /// </exception>
    public double Calculate(string left, string right)
    {
        var similarity = similarityCalculator.Calculate(left, right);

        return 1.0 - similarity;
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
using System;
2+
using System.Collections.Generic;
3+
4+
namespace Algorithms.Strings.Similarity;
5+
6+
/// <summary>
///     <para>
///         Jaccard similarity is a statistic that measures how similar two sets of data are. It is calculated by
///         dividing the number of elements common to both sets by the number of elements present in either set.
///         More formally, it is the size of the intersection divided by the size of the union of the two sets.
///     </para>
///     <para>
///         The result is a value between 0 and 1, where 0 means no similarity and 1 means perfect similarity.
///     </para>
///     <para>
///         For example, suppose we have two sets of words:
///         <list type="bullet">
///             <item>
///                 A = {apple, banana, cherry, date}
///             </item>
///             <item>
///                 B = {banana, cherry, elderberry, fig}
///             </item>
///         </list>
///     </para>
///     <para>
///         The sets have 2 elements in common (banana and cherry), and 6 elements appear in at least one of them
///         (apple, banana, cherry, date, elderberry, fig).
///     </para>
///     <para>
///         The Jaccard similarity coefficient is 2 / 6 = 0.333333, or 33.333% similarity.
///     </para>
/// </summary>
public class JaccardSimilarity
{
    /// <summary>
    ///     Calculates the Jaccard similarity coefficient between two strings.
    /// </summary>
    /// <param name="left">The first string to compare.</param>
    /// <param name="right">The second string to compare.</param>
    /// <returns>A double value between 0 and 1 that represents the similarity of the two strings.</returns>
    /// <exception cref="ArgumentNullException">Thrown when either input is null.</exception>
    /// <remarks>
    ///     Each string is treated as the set of distinct characters it contains; the sets are held in a HashSet.
    /// </remarks>
    public double Calculate(string left, string right)
    {
        ValidateInput(left, right);

        var leftIsEmpty = left.Length == 0;
        var rightIsEmpty = right.Length == 0;

        if (leftIsEmpty || rightIsEmpty)
        {
            // Two empty strings count as identical (this also avoids a 0 / 0 division below);
            // an empty string has nothing in common with a non-empty one.
            return leftIsEmpty && rightIsEmpty ? 1.0d : 0.0d;
        }

        // Distinct characters of each input.
        var leftChars = new HashSet<char>(left);
        var rightChars = new HashSet<char>(right);

        // Characters that occur in both inputs.
        var sharedChars = new HashSet<char>(leftChars);
        sharedChars.IntersectWith(rightChars);

        // |A ∪ B| = |A| + |B| - |A ∩ B|
        var unionCount = leftChars.Count + rightChars.Count - sharedChars.Count;

        return (double)sharedChars.Count / unionCount;
    }

    /// <summary>
    ///     Validates the input strings and throws an exception if either is null.
    /// </summary>
    /// <param name="left">The first string to validate.</param>
    /// <param name="right">The second string to validate.</param>
    /// <exception cref="ArgumentNullException">
    ///     Thrown when either argument is null; <paramref name="left" /> is reported when both are null.
    /// </exception>
    private static void ValidateInput(string left, string right)
    {
        if (left is null)
        {
            throw new ArgumentNullException(nameof(left), "Input cannot be null");
        }

        if (right is null)
        {
            throw new ArgumentNullException(nameof(right), "Input cannot be null");
        }
    }
}

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ find more than one implementation for the same objective but using different alg
178178
* [A019434 Fermat Primes](./Algorithms/Sequences/FermatPrimesSequence.cs)
179179
* [A181391 Van Eck's](./Algorithms/Sequences/VanEcksSequence.cs)
180180
* [String](./Algorithms/Strings)
181+
* [Similarity](./Algorithms/Strings/Similarity/)
182+
* [Jaccard Similarity](./Algorithms/Strings/Similarity/JaccardSimilarity.cs)
183+
* [Jaccard Distance](./Algorithms/Strings/Similarity/JaccardDistance.cs)
181184
* [Longest Consecutive Character](./Algorithms/Strings/GeneralStringAlgorithms.cs)
182185
* [Naive String Search](./Algorithms/Strings/NaiveStringSearch.cs)
183186
* [Rabin Karp](./Algorithms/Strings/RabinKarp.cs)

0 commit comments

Comments
 (0)