map 底层实现总结

Kesa...大约 6 分钟

以 Go v1.18 为例总结 map 的底层实现。

1. 数据结构

1.1 hmap

// A header for a Go map.
// hmap is the top-level descriptor: it records the element count, the bucket
// array (and the previous array while a grow is in progress), and bookkeeping
// used by the access/assign/delete routines.
type hmap struct {
	// Note: the format of the hmap is also encoded in cmd/compile/internal/reflectdata/reflect.go.
	// Make sure this stays in sync with the compiler's definition.
	count     int // # live cells == size of map.  Must be first (used by len() builtin)
	flags     uint8 // state bits; mapassign/mapdelete toggle hashWriting here to detect concurrent writes
	B         uint8  // log_2 of # of buckets (can hold up to loadFactor * 2^B items)
	noverflow uint16 // approximate number of overflow buckets; see incrnoverflow for details
	hash0     uint32 // hash seed; passed to t.hasher, and reset by mapdelete when count reaches 0

	buckets    unsafe.Pointer // array of 2^B Buckets. may be nil if count==0.
	oldbuckets unsafe.Pointer // previous bucket array of half the size, non-nil only when growing
	nevacuate  uintptr        // progress counter for evacuation (buckets less than this have been evacuated)

	extra *mapextra // optional fields
}

// mapextra holds fields that are not present on all maps.
// It is reached through hmap.extra.
type mapextra struct {
	// If both key and elem do not contain pointers and are inline, then we mark bucket
	// type as containing no pointers. This avoids scanning such maps.
	// However, bmap.overflow is a pointer. In order to keep overflow buckets
	// alive, we store pointers to all overflow buckets in hmap.extra.overflow and hmap.extra.oldoverflow.
	// overflow and oldoverflow are only used if key and elem do not contain pointers.
	// overflow contains overflow buckets for hmap.buckets.
	// oldoverflow contains overflow buckets for hmap.oldbuckets.
	// The indirection allows to store a pointer to the slice in hiter.
	overflow    *[]*bmap // overflow buckets chained off hmap.buckets
	oldoverflow *[]*bmap // overflow buckets chained off hmap.oldbuckets (relevant while growing)

	// nextOverflow holds a pointer to a free overflow bucket.
	nextOverflow *bmap
}

hmap：

count : 哈希表中元素的数量
B：用于表示哈希表 buckets 数量；buckets 数量为 $2^B$
noverflow：溢出桶的近似数量
hash0：哈希种子
buckets：存储桶数组
oldbuckets：进入扩容状态后，旧存储桶数组

mapextra：

overflow：溢出桶数组
oldoverflow：进入扩容状态后，旧溢出桶数组
nextOverflow：指向下一个可用溢出桶

1.2 bmap

bmap定义中只有一个字段：

// bmap is a bucket as declared in the runtime source. Only the tophash array
// is declared here; keys, elems and the overflow link are located by pointer
// arithmetic (see the dataOffset-based address computations in mapaccess1).
type bmap struct {
	// One byte per slot. It holds either the value tophash(hash) produced for
	// the stored key, or a marker such as emptyOne / emptyRest for empty slots.
	tophash [bucketCnt]uint8
}

其余字段在编译器添加，重建后的结果如下：

// bmap as laid out after the compiler appends the remaining fields.
// This is an illustrative reconstruction, not a declaration from the runtime.
type bmap struct {
    topbits  [8]uint8     // per-slot top-hash byte (the tophash array of the source declaration)
    keys     [8]keytype   // all 8 keys are stored together, ahead of the values
    values   [8]valuetype // all 8 values follow the keys
    pad      uintptr      // NOTE(review): presumably alignment padding — confirm against the compiler
    overflow uintptr      // link to the next overflow bucket; buckets form a singly linked list
}

topbits：哈希值高八位；长度为 8 的数组
keys：key；长度为 8 的数组
values：value；长度为 8 的数组

正常桶和溢出桶构成单向链表。

2. 访问操作

编译期

根据表达式左边的变量数量决定调用的函数：

只有一个变量，v := hash[key]；调用函数 runtime.mapaccess1
两个变量，v, ok := hash[key]；调用函数runtime.mapaccess2

mapaccess2 会多返回一个 bool 类型值，表示 key 是否存在

运行时

// mapaccess1 looks up key in h and returns a pointer to its elem.
// If the key is not found it returns a pointer to zeroVal instead of nil.
// (The leading "..." marks code elided from this excerpt.)
func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
	...
	hash := t.hasher(key, uintptr(h.hash0))
	// The low bits of the hash (selected by the mask for h.B) pick the bucket.
	m := bucketMask(h.B)
	b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
	// While a grow is in progress the entry may still live in the old array:
	// if the corresponding old bucket has not been evacuated, search it instead.
	if c := h.oldbuckets; c != nil {
		if !h.sameSizeGrow() {
			// There used to be half as many buckets; mask down one more power of two.
			m >>= 1
		}
		oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
		if !evacuated(oldb) {
			b = oldb
		}
	}
	top := tophash(hash)
bucketloop:
	// Walk the bucket and then its chain of overflow buckets.
	for ; b != nil; b = b.overflow(t) {
		for i := uintptr(0); i < bucketCnt; i++ {
			// Cheap filter: compare the one-byte tophash before the full key.
			if b.tophash[i] != top {
				// emptyRest ends the search early: nothing is stored beyond this slot.
				if b.tophash[i] == emptyRest {
					break bucketloop
				}
				continue
			}
			// Keys start at dataOffset; slot i's key is i*keysize further in.
			k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
			if t.indirectkey() {
				// The slot stores a pointer to the key rather than the key itself.
				k = *((*unsafe.Pointer)(k))
			}
			if t.key.equal(key, k) {
				// Elems are laid out after all bucketCnt keys.
				e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
				if t.indirectelem() {
					e = *((*unsafe.Pointer)(e))
				}
				return e
			}
		}
	}
	// Not found: hand back the shared zero value.
	return unsafe.Pointer(&zeroVal[0])
}

主要流程：

计算 key 的 hash 值
计算存储桶索引位置，哈希值 mod 桶数组长度：
- 计算掩码 (1 << B) - 1
- 进行按位与运算，得到 hash 值的低 B 位；
若处于扩容中，则尝试从旧桶中获取数据
获取 hash 值高八位
遍历桶及其溢出桶，直到找到 value 或遍历结束
- 比较 hash 值高八位
- 相同，则比较 key
- key 相同，则查找 value，返回结果

3. 写入

编译期

解析表达式，转换成调用 runtime.mapassign

运行时

// Like mapaccess, but allocates a slot for the key if it is not present in the map.
// It returns a pointer to the elem slot for key; storing the value itself is
// not done in this function (presumably the caller writes through the
// returned pointer — confirm against the compiler-generated call site).
// (The leading "..." marks code elided from this excerpt.)
func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
	...
	hash := t.hasher(key, uintptr(h.hash0))

	// Set hashWriting after calling t.hasher, since t.hasher may panic,
	// in which case we have not actually done a write.
	h.flags ^= hashWriting

	// Lazily create the bucket storage on first write.
	if h.buckets == nil {
		h.buckets = newobject(t.bucket) // newarray(t.bucket, 1)
	}

again:
	bucket := hash & bucketMask(h.B)
	// If a grow is in progress, do a share of the evacuation work for this
	// bucket before touching it.
	if h.growing() {
		growWork(t, h, bucket)
	}
	b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
	top := tophash(hash)

	// inserti/insertk/elem remember the first free slot seen, in case the key
	// turns out not to be present.
	var inserti *uint8
	var insertk unsafe.Pointer
	var elem unsafe.Pointer
bucketloop:
	for {
		for i := uintptr(0); i < bucketCnt; i++ {
			if b.tophash[i] != top {
				// Record only the first empty slot, then keep scanning: the
				// key may still exist further along the chain.
				if isEmpty(b.tophash[i]) && inserti == nil {
					inserti = &b.tophash[i]
					insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
					elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
				}
				// emptyRest: nothing is stored past this point, stop searching.
				if b.tophash[i] == emptyRest {
					break bucketloop
				}
				continue
			}
			k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
			if t.indirectkey() {
				k = *((*unsafe.Pointer)(k))
			}
			if !t.key.equal(key, k) {
				continue
			}
			// already have a mapping for key. Update it.
			if t.needkeyupdate() {
				typedmemmove(t.key, k, key)
			}
			elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
			goto done
		}
		ovf := b.overflow(t)
		if ovf == nil {
			// End of chain; b is left pointing at the last bucket, which
			// newoverflow below uses as the bucket to extend.
			break
		}
		b = ovf
	}

	// Did not find mapping for key. Allocate new cell & add entry.

	// If we hit the max load factor or we have too many overflow buckets,
	// and we're not already in the middle of growing, start growing.
	if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
		hashGrow(t, h)
		goto again // Growing the table invalidates everything, so try again
	}

	if inserti == nil {
		// The current bucket and all the overflow buckets connected to it are full, allocate a new one.
		newb := h.newoverflow(t, b)
		inserti = &newb.tophash[0]
		insertk = add(unsafe.Pointer(newb), dataOffset)
		elem = add(insertk, bucketCnt*uintptr(t.keysize))
	}

	// store new key/elem at insert position
	if t.indirectkey() {
		// Indirect keys: allocate the key separately and store its pointer in the slot.
		kmem := newobject(t.key)
		*(*unsafe.Pointer)(insertk) = kmem
		insertk = kmem
	}
	if t.indirectelem() {
		// Indirect elems: allocate the elem separately and store its pointer in the slot.
		vmem := newobject(t.elem)
		*(*unsafe.Pointer)(elem) = vmem
	}
	typedmemmove(t.key, insertk, key)
	*inserti = top
	h.count++

done:
	// hashWriting was set above; finding it cleared here means the flag was
	// changed in the meantime, which is reported as a concurrent write.
	if h.flags&hashWriting == 0 {
		throw("concurrent map writes")
	}
	h.flags &^= hashWriting
	if t.indirectelem() {
		// Return the address of the elem itself, not of the slot holding its pointer.
		elem = *((*unsafe.Pointer)(elem))
	}
	return elem
}

主要流程：

1) 计算 key 的 hash 值
2) 计算存储桶的索引位置
3) 若此时处于扩容状态，触发一次扩容操作，对桶中的数据进行分流
4) 获取 tophash
5) 遍历桶及其溢出桶
- 若 tophash 为空，则记录第一个空位作为候选插入位置，并继续遍历；若该位置为 emptyRest（其后均为空），则结束遍历
- 若不为空，则比较 tophash；若不同，则继续步骤 5）
- 相同，则比较 key ；若不同，则继续步骤 5）
- 若 key 相同，则更新 value，结束
6) 判断是否需要扩容，若需要则触发扩容并回到步骤 2)
7) 遍历结束后，若未找到插入位置，则说明桶已满，创建新的溢出桶；最后在插入位置写入 key 和 value

4. 删除

编译期

将表达式delete(hash, key)转换成runtime.mapdelete系列函数中的一个。

运行时

// mapdelete removes key from h if it is present; a missing key is a no-op
// apart from the write-flag bookkeeping.
// (The leading "..." marks code elided from this excerpt.)
func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) {
	...
	hash := t.hasher(key, uintptr(h.hash0))

	// Set hashWriting after calling t.hasher, since t.hasher may panic,
	// in which case we have not actually done a write (delete).
	h.flags ^= hashWriting

	bucket := hash & bucketMask(h.B)
	// If a grow is in progress, do a share of the evacuation work for this
	// bucket before touching it.
	if h.growing() {
		growWork(t, h, bucket)
	}
	b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
	// Remember the head of the chain: the emptyRest back-propagation below
	// needs it to find a bucket's predecessor.
	bOrig := b
	top := tophash(hash)
search:
	for ; b != nil; b = b.overflow(t) {
		for i := uintptr(0); i < bucketCnt; i++ {
			if b.tophash[i] != top {
				// emptyRest: nothing is stored past this point, so the key is absent.
				if b.tophash[i] == emptyRest {
					break search
				}
				continue
			}
			k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
			// k stays the slot address (needed for clearing); k2 is the key to compare.
			k2 := k
			if t.indirectkey() {
				k2 = *((*unsafe.Pointer)(k2))
			}
			if !t.key.equal(key, k2) {
				continue
			}
			// Only clear key if there are pointers in it.
			if t.indirectkey() {
				*(*unsafe.Pointer)(k) = nil
			} else if t.key.ptrdata != 0 {
				memclrHasPointers(k, t.key.size)
			}
			// Unlike the key, the elem slot is cleared in every case.
			e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
			if t.indirectelem() {
				*(*unsafe.Pointer)(e) = nil
			} else if t.elem.ptrdata != 0 {
				memclrHasPointers(e, t.elem.size)
			} else {
				memclrNoHeapPointers(e, t.elem.size)
			}
			b.tophash[i] = emptyOne
			// If the bucket now ends in a bunch of emptyOne states,
			// change those to emptyRest states.
			// It would be nice to make this a separate function, but
			// for loops are not currently inlineable.
			// First decide whether slot i is the last occupied position in the
			// chain: whatever follows it must already be emptyRest.
			if i == bucketCnt-1 {
				if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest {
					goto notLast
				}
			} else {
				if b.tophash[i+1] != emptyRest {
					goto notLast
				}
			}
			// Walk backwards, converting trailing emptyOne slots to emptyRest,
			// crossing into the previous bucket when slot 0 is reached.
			for {
				b.tophash[i] = emptyRest
				if i == 0 {
					if b == bOrig {
						break // beginning of initial bucket, we're done.
					}
					// Find previous bucket, continue at its last entry.
					c := b
					for b = bOrig; b.overflow(t) != c; b = b.overflow(t) {
					}
					i = bucketCnt - 1
				} else {
					i--
				}
				if b.tophash[i] != emptyOne {
					break
				}
			}
		notLast:
			h.count--
			// Reset the hash seed to make it more difficult for attackers to
			// repeatedly trigger hash collisions. See issue 25237.
			if h.count == 0 {
				h.hash0 = fastrand()
			}
			break search
		}
	}

	// hashWriting was set above; finding it cleared here means the flag was
	// changed in the meantime, which is reported as a concurrent write.
	if h.flags&hashWriting == 0 {
		throw("concurrent map writes")
	}
	h.flags &^= hashWriting
}

主要流程：

计算 key 的 hash 值
计算桶的索引
若处于扩容状态，则触发一次扩容操作
遍历桶及其溢出桶，查找 key，若找到则将 key 和 value 删除

5. 扩容

扩容条件

装载因子超过 6.5
溢出桶过多；溢出桶的数量近似和正常桶数量一样多（小于 $2^{15}$ 使用准确值，大于等于则使用近似值）

扩容类型

翻倍扩容，装载因子超过 6.5
等量扩容，溢出桶过多

扩容流程

等量扩容：

创建新的桶数组，将旧桶数据以一对一关系进行迁移

翻倍扩容：

创建大小为旧桶两倍的新桶，将一个旧桶的数据分流到两个新桶

扩容操作时机

扩容并不是一次性完成的，而是在写入和删除操作时，对当前操作的桶进行一次迁移（渐进式扩容）。

Reference

昵称

邮箱

网址

按正序
按倒序
按热度

map 底层实现总结

# 1. 数据结构

# 1.1 hmap

# 1.2 bmap

# 2. 访问操作

# 编译期

# 运行时

# 3. 写入

# 编译期

# 运行时

# 4. 删除

# 编译期

# 运行时

# 5. 扩容

# 扩容条件

# 扩容类型

# 扩容流程

# 扩容操作时机

# Reference

预览:

1. 数据结构

1.1 hmap

1.2 bmap

2. 访问操作

编译期

运行时

3. 写入

编译期

运行时

4. 删除

编译期

运行时

5. 扩容

扩容条件

扩容类型

扩容流程

扩容操作时机

Reference