Sorensen Dice

Sorensen Dice 相似度系数:两个集合交集元素个数的 2 倍,除以两个集合元素个数之和(注意分母不是并集的大小);结果与元素的顺序无关。

java code

/**
 * Sorensen-Dice similarity coefficient between the character sets of two strings.
 *
 * <p>Each string is reduced to the set of distinct Unicode code points it contains
 * (so order and repetition are ignored), and the result is twice the size of the
 * intersection divided by the sum of the two set sizes (not the size of the union):
 *
 * <pre>
 *             2 * |A ∩ B|
 * f(A,B) =  ---------------
 *              |A| + |B|
 * </pre>
 *
 * <p>Special cases: two {@code null} inputs or two empty strings are treated as
 * identical and yield {@code 1}; exactly one {@code null} input yields {@code 0}.
 *
 * @param a first sample, may be {@code null}
 * @param b second sample, may be {@code null}
 * @return similarity in the range {@code [0, 1]}
 */
float dice(String a,String b)
{
    // Two missing samples are considered identical.
    if (a == null && b == null) {
        return 1f;
    }
    if (a == null || b == null) {
        return 0f;
    }
    // codePoints() rather than chars(): a supplementary character (e.g. an emoji) must
    // count as one element, not as two surrogate halves that can collide with others.
    Set<Integer> aPoints = a.codePoints().boxed().collect(Collectors.toSet());
    Set<Integer> bPoints = b.codePoints().boxed().collect(Collectors.toSet());
    int total = aPoints.size() + bPoints.size();
    // Both strings empty: identical inputs, and the formula would otherwise be 0/0.
    if (total == 0) {
        return 1f;
    }
    // Count the intersection without mutating either set; Collectors.toSet() gives no
    // guarantee that the returned set supports retainAll().
    long shared = aPoints.stream().filter(bPoints::contains).count();
    return (2f * shared) / total;
}

jshell test

jshell> /open d://jshell/dice.txt

jshell> dice("abcde","abcde")
$2 ==> 1.0

jshell> dice("abcde","abcdf")
$3 ==> 0.8

jshell> dice("abcde","abfde")
$4 ==> 0.8

jshell> dice("abcde","abcif")
$5 ==> 0.6

jshell> dice("abcde","abjif")
$6 ==> 0.4

jshell> dice("abcde","akjif")
$7 ==> 0.2

jshell> dice("abcde","lkjif")
$8 ==> 0.0