看到HashSet源码的时候,我惊了一下,因为我发现HashSet的内部实现跟Dictionary几乎是一模一样的,传送门:抛除C++旧印象(二):C#Dictionary源码剖析。差别仅在于HashSet没有key,只有value,所以存储的时候是用value的HashCode值进行判断。
我们先来看下HashSet<T>类开始部分的代码(经过我稍微的整理):
public class HashSet<T> : ICollection<T>, ISerializable, IDeserializationCallback, ISet<T>, IReadOnlyCollection<T> { private const int Lower31BitMask = 0x7FFFFFFF; private const int StackAllocThreshold = 100; private const int ShrinkThreshold = 3; #if !SILVERLIGHT // constants for serialization private const String CapacityName = "Capacity"; private const String ElementsName = "Elements"; private const String ComparerName = "Comparer"; private const String VersionName = "Version"; #endif internal struct Slot { internal int hashCode; // Lower 31 bits of hash code, -1 if unused internal T value; internal int next; // Index of next entry, -1 if last } private int[] m_buckets; private Slot[] m_slots; private int m_count; private int m_lastIndex; private int m_freeList; private IEqualityComparer<T> m_comparer; private int m_version; 对比Dictionary你会发现连索引数组m_buckets名字都一样,值数组结构体Slot唯一的差别在于Dictionary的结构体Entry多了个TKey字段。接下来我们再来看看HashSet<T>的Add函数,明白他是怎么插入的,对他的结构就很一目了然了:
void ICollection<T>.Add(T item) { AddIfNotPresent(item); } private bool AddIfNotPresent(T value) { if (m_buckets == null) { Initialize(0); } int hashCode = InternalGetHashCode(value);//得到value的hashcode&0x7FFFFFFF的值 int bucket = hashCode % m_buckets.Length; //对当前数组长度取余,得到具体的索引值 #if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE int collisionCount = 0; #endif for (int i = m_buckets[hashCode % m_buckets.Length] - 1; i >= 0; i = m_slots[i].next) {//索引值指向值数组m_slots,进行遍历查找 if (m_slots[i].hashCode == hashCode && m_comparer.Equals(m_slots[i].value, value)) {//如果已存在,则直接返回 return false; } #if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE collisionCount++; #endif } int index; if (m_freeList >= 0) {//判断是否存在空链(Remove操作后留下的空内存块) index = m_freeList; m_freeList = m_slots[index].next; } else { if (m_lastIndex == m_slots.Length) {//如果已达到最大存储值,则重新分配新的内存块进行存储 IncreaseCapacity(); // this will change during resize bucket = hashCode % m_buckets.Length; } index = m_lastIndex;//没有空链,则放在数组的末尾 m_lastIndex++; } m_slots[index].hashCode = hashCode;//进行存储 m_slots[index].value = value; m_slots[index].next = m_buckets[bucket] - 1; m_buckets[bucket] = index + 1; m_count++; m_version++; #if FEATURE_RANDOMIZED_STRING_HASHING && !FEATURE_NETCORE if(collisionCount > HashHelpers.HashCollisionThreshold && HashHelpers.IsWellKnownEqualityComparer(m_comparer)) { m_comparer = (IEqualityComparer<T>) HashHelpers.GetRandomizedEqualityComparer(m_comparer); SetCapacity(m_buckets.Length, true); } #endif // FEATURE_RANDOMIZED_STRING_HASHING return true; } 如果对上面的代码不够理解,建议先看完上篇Dictionary。个人看完上面这段代码后着实不解,因为存储索引的时候用了各种的+1,-1,好蠢,因为结构跟Dictionary完全一致,所以直接把dictionary的代码拿过来稍微改改就行了,而且存储索引根本没必要+1,-1,直接储存相应的值不就完了吗?真没想到微软员工写的代码也这么糟糕啊,逻辑不清晰,因为储存的数值不是原值,所以其他接口也是各种的+1,-1,醉了:
public bool Remove(T item) { if (m_buckets != null) { int hashCode = InternalGetHashCode(item); int bucket = hashCode % m_buckets.Length; int last = -1; for (int i = m_buckets[bucket] - 1; i >= 0; last = i, i = m_slots[i].next) { if (m_slots[i].hashCode == hashCode && m_comparer.Equals(m_slots[i].value, item)) { if (last < 0) { // first iteration; update buckets m_buckets[bucket] = m_slots[i].next + 1; } else { // subsequent iterations; update 'next' pointers m_slots[last].next = m_slots[i].next; } m_slots[i].hashCode = -1; m_slots[i].value = default(T); m_slots[i].next = m_freeList; m_count--; m_version++; if (m_count == 0) { m_lastIndex = 0; m_freeList = -1; } else { m_freeList = i; } return true; } } } // either m_buckets is null or wasn't found return false; } 结论:只要你理解了Dictionary,那么hashSet就是少了个Key的Dictionary,完全没必要再看了,代码还稍微有点辣眼睛0.0。