前言
本篇主要是对DPDK的EAL(Environment Abstraction Layer)中内存的初始化流程进行总结,由于DPDK支持多进程应用,此篇总结主要针对primary process主进程流程进行跟踪总结,先了解下主次进程概念,如下:
1,在DPDK中,初始化由primary process完成。而其他process统称为secondary process,其可以通过读取一些文件来获取primary process的初始化信息,从而使得自身与primary process保持相同的内存映像。
2, DPDK采用了一种集中式控制的方式,比如在多进程的场景中,若一个secondary process要申请内存,则向primary process发起请求,由primary process完成相应操作后再通知secondary process。
一、初始化相关的代码调用流程
从lib/librte_eal/linux/eal/eal.c中的函数int rte_eal_init(int argc,char **argv)开始,内存的初始化调用栈依次为:
int rte_eal_init()
----eal_reset_internal_config()
----rte_config_init()
----eal_hugepage_info_init()
----rte_eal_memzone_init()
----rte_eal_memory_init()
----rte_eal_malloc_heap_init()
下面依次对这几个方面进行解析:
int
rte_eal_init(int argc, char **argv){
······
eal_reset_internal_config(&internal_config);
rte_config_init();
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet */
ret = internal_config.process_type == RTE_PROC_PRIMARY ?
eal_hugepage_info_init() :
eal_hugepage_info_read();
······
}
······
if (rte_eal_memzone_init() < 0) { ······ }
if (rte_eal_memory_init() < 0) { ······ }
if (rte_eal_malloc_heap_init() < 0) { ······ }
}
1、eal_reset_internal_config()初始化全局变量internal_config;
结构体主要成员定义如下:
struct internal_config { //DPDK的全局配置信息
volatile size_t memory; /**< amount of asked memory */
//请求分配的内存数量
·······
volatile unsigned no_hugetlbfs; /**< true to disable hugetlbfs */
//是否禁用hugetlbfs(为真时表示不使用hugetlbfs)
unsigned hugepage_unlink; /**< true to unlink backing files */
//是否删除hugepage文件(DPDK在memalloc时将每一个hugepage当做一个文件处理)
·······
volatile unsigned no_shconf; /**< true if there is no shared config */
//是否允许共享,不允许的话primary process不会将初始化信息写入到文件
volatile enum rte_proc_type_t process_type; /**< multi-process proc type */
//用于区分primary process, 或者secondary process
/** true to try allocating memory on specific sockets */
volatile unsigned force_sockets; //强制在指定的socket上分配内存
volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
//表示每一个socket分配的内存数量
volatile unsigned force_socket_limits; //设置是否限制socket分配的内存
volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /**< limit amount of memory per socket */
//每一个socket分配的内存的上限
uintptr_t base_virtaddr; /**< base address to try and reserve memory from */
//从指定的虚拟地址分配内存
volatile unsigned legacy_mem;
//指明是legacy mode, 或者dynamic mode
volatile unsigned single_file_segments;
/**< true if storing all pages within single files (per-page-size,* per-node) non-legacy mode only.*/
//指明是single-file-segments mode, 或者 page-per-file mode
unsigned num_hugepage_sizes; /**< how many sizes on this system */
//系统支持的大页尺寸的种类数(例如同时支持2M和1G时为2)
struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
//大页内存信息保存,主要初始化结构体
};
对应的初始化函数把主要成员给初始值:
eal_reset_internal_config(struct internal_config *internal_cfg)
{
int i;
internal_cfg->memory = 0;
internal_cfg->force_nrank = 0;
internal_cfg->force_nchannel = 0;
internal_cfg->hugefile_prefix = NULL;
internal_cfg->hugepage_dir = NULL;
............
internal_cfg->create_uio_dev = 0;
internal_cfg->iova_mode = RTE_IOVA_DC;
internal_cfg->user_mbuf_pool_ops_name = NULL;
CPU_ZERO(&internal_cfg->ctrl_cpuset);
internal_cfg->init_complete = 0;
}
GDB看到的初始化值内容:
(gdb) p internal_config
$1 = {memory = 0, force_nchannel = 0, force_nrank = 0, no_hugetlbfs = 0, hugepage_unlink = 0, no_pci = 0,
no_hpet = 1, vmware_tsc_map = 0, no_shconf = 0, in_memory = 0, create_uio_dev = 0,
process_type = RTE_PROC_PRIMARY, force_sockets = 0, socket_mem = {0, 0, 0, 0, 0, 0, 0, 0},
force_socket_limits = 0, socket_limit = {0, 0, 0, 0, 0, 0, 0, 0}, base_virtaddr = 0, legacy_mem = 0,
single_file_segments = 0, syslog_facility = 24, vfio_intr_mode = RTE_INTR_MODE_NONE, hugefile_prefix = 0x0,
hugepage_dir = 0x0, user_mbuf_pool_ops_name = 0x0, num_hugepage_sizes = 0, hugepage_info = {{
hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0, 0, 0, 0, 0, 0},
lock_descriptor = -1}, {hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0,
0, 0, 0, 0, 0}, lock_descriptor = -1}, {hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>,
num_pages = {0, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = -1}, {hugepage_sz = 0,
hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = -1}},
iova_mode = RTE_IOVA_DC, ctrl_cpuset = {__bits = {1, 0 <repeats 15 times>}}, init_complete = 0}
(gdb)
(gdb) p internal_config.process_type
$2 = RTE_PROC_PRIMARY
也就是后续初始化按照主流程RTE_PROC_PRIMARY进行初始化内存;
2、rte_config_init() :初始化内存配置
涉及结构体:
struct rte_config { //运行时环境的配置
······
/** PA or VA mapping mode */
enum rte_iova_mode iova_mode;
//指明了DMA使用虚拟地址(virtual address, 简称VA), 还是物理地址(physical address, 简称PA)
/**
* Pointer to memory configuration, which may be shared across multiple
* DPDK instances
*/
struct rte_mem_config *mem_config;
//这个指针指向的内存空间存放了一个DPDK instance的内存分布情况
//DPDK内存初始化过程主要是初始化struct rte_mem_config中的每一项
} __attribute__((__packed__));
struct rte_mem_config {
volatile uint32_t magic; /**< Magic number - Sanity check. */
/* memory topology */
uint32_t nchannel; /**< Number of channels (0 if unknown). */
uint32_t nrank; /**< Number of ranks (0 if unknown). */
······
/* memory segments and zones */
struct rte_fbarray memzones; /**< Memzone descriptors. */
//每一个struct rte_memseg_list中使用<socket id, pagesz>进行标识
//memsegs 可能存在多个具有相同<socket id, page_sz>的struct rte_memseg_list
struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
/**< list of dynamic arrays holding memsegs */
······
/* Heaps of Malloc */
struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
······
uint64_t mem_cfg_addr; //这个地址等于struct rte_config中的struct rte_mem_config *mem_config
/* legacy mem and single file segments options are shared */
uint32_t legacy_mem;
//指明内存是legacy mode, 还是dynamic mode
uint32_t single_file_segments;
// 指明memalloc是single-file-segments mode, 还是page-per-file mode
······
} __attribute__((__packed__));
涉及的代码:
/*
 * Point rte_config at the shared memory configuration.
 *
 * The process type is copied over from internal_config; when it is
 * RTE_PROC_PRIMARY the shared config is created via
 * rte_eal_config_create(). No other process type is handled here.
 */
static void
rte_config_init(void)
{
	rte_config.process_type = internal_config.process_type;

	/* only the primary process creates the config */
	if (rte_config.process_type == RTE_PROC_PRIMARY)
		rte_eal_config_create();
}
这个函数主要是为struct rte_config中的struct rte_mem_config *mem_config(简称mcfg)申请一块内存空间,并且在运行时目录下创建一个名字为config的文件,并且将mcfg的内容写进此文件。这样,secondary process在初始化时就能通过读取config文件来创建和primary process一样的内存映像。
/* create memory configuration in shared/mmap memory. Take out
* a write lock on the memsegs, so we can auto-detect primary/secondary.
* This means we never close the file while running (auto-close on exit).
* We also don't lock the whole file, so that in future we can use read-locks
* on other parts, e.g. memzones, to detect if there are running secondary
* processes. */
static void
rte_eal_config_create(void)
{
void *rte_mem_cfg_addr;
int retval;
const char *pathname = eal_runtime_config_path();
/* map the config before hugepage address so that we don't waste a page */
if (internal_config.base_virtaddr != 0)
rte_mem_cfg_addr = (void *)
RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
else
rte_mem_cfg_addr = NULL;
if (mem_cfg_fd < 0){
mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
if (mem_cfg_fd < 0)
rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
}
。。。。。。
。。。。。。
rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
if (rte_mem_cfg_addr == MAP_FAILED){
rte_panic("Cannot mmap memory for rte_config\n");
}
memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
rte_config.mem_config = rte_mem_cfg_addr;
}
在这里采用了mmap()的方式将config文件和mcfg进行了映射,所以在后面的初始化操作中,一旦对config进行了写操作,也能够立刻反映到其他的进程中(类似于使用共享内存通信);
# ls /var/run/dpdk/rte/config
/var/run/dpdk/rte/config
gdb看到的数据如下:
(gdb) p rte_config
$2 = {master_lcore = 1, lcore_count = 3, numa_node_count = 1, numa_nodes = {0, 0, 0, 0, 0, 0, 0, 0},
service_lcore_count = 0, lcore_role = {ROLE_OFF, ROLE_RTE, ROLE_RTE, ROLE_RTE,
ROLE_OFF <repeats 252 times>}, process_type = RTE_PROC_PRIMARY, iova_mode = RTE_IOVA_DC,
mem_config = 0x7fb4a0e000}
(gdb) p rte_config.mem_config
$3 = (struct rte_mem_config *) 0x7fb4a0e000
(gdb) p /x rte_config.mem_config.mem_cfg_addr
$7 = 0x7fb4a0e000
3、eal_hugepage_info_init() : 读取系统中的hugepage的信息。
涉及结构体:
/*
 * internal configuration structure for the number, size and
 * mount points of hugepages
 *
 * One instance describes a single hugepage size found on the system.
 */
struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */
/* size of one hugepage of this kind */
char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */
/* hugetlbfs mount point for this page size */
uint32_t num_pages[RTE_MAX_NUMA_NODES];
/* page counts, one array slot per NUMA socket */
/**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */
/* file descriptor for the mount point (the hugedir field) */
};
涉及代码块:
static int
hugepage_info_init(void)
{
DIR *dir;
struct dirent *dirent;
dir = opendir(sys_dir_path);
if (dir == NULL) {
RTE_LOG(ERR, EAL,
"Cannot open directory %s to read system hugepage info\n",
sys_dir_path);
return -1;
}
for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
struct hugepage_info *hpi;
。。。
hpi = &internal_config.hugepage_info[num_sizes];
hpi->hugepage_sz =
rte_str_to_size(&dirent->d_name[dirent_start_len]);
/* first, check if we have a mountpoint */
if (get_hugepage_dir(hpi->hugepage_sz,
hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
uint32_t num_pages;
num_pages = get_num_hugepages(dirent->d_name);
if (num_pages > 0)
......
continue;
}
/* try to obtain a writelock */
hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
/* if blocking lock failed */
if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
}
calc_num_pages(hpi, dirent);
num_sizes++;
}
closedir(dir);
internal_config.num_hugepage_sizes = num_sizes;
/* sort the page directory entries by size, largest to smallest */
qsort(&internal_config.hugepage_info[0], num_sizes,
sizeof(internal_config.hugepage_info[0]), compare_hpi);
}
在linux系统中,会打开系统目录/sys/kernel/mm/hugepages,遍历其中的每一个目录项,以获取系统支持的hugepage size。然后从/proc/mounts中根据hugepage size获取对应挂载点(mount point), 再计算每一种hugepage在不同socket上的空闲(free)页数量,并将每一种大页的相关信息存放在internal_config->hugepage_info中。最后会在runtime dir下创建一个名字为hugepage_info的文件,将internal_config->hugepage_info写入到该文件。
gdb下看到的信息:
(gdb) p internal_config.hugepage_info
$9 = {hugepage_sz = 2097152, hugedir = "/mnt/hugetlbfs", '\000' <repeats 4081 times>, num_pages = {6667, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = 10}
(gdb) p 2097152/1024/1024
$12 = 2
# ls /sys/kernel/mm/hugepages/
hugepages-2048kB
# cat /proc/mounts | grep hugetlbfs
none /mnt/hugetlbfs hugetlbfs rw,relatime 0 0
# cat /proc/meminfo | grep Huge
AnonHugePages: 88064 kB
HugePages_Total: 6667
HugePages_Free: 6667
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
# ls /var/run/dpdk/rte/
config hugepage_info mp_socket
# cat /var/run/dpdk/rte/hugepage_info
/mnt/hugetlbfs
4、rte_eal_memzone_init()初始化内存域
涉及的结构体:
/*
 * Descriptor of one named memory zone. Elements of this type are what the
 * memzone array is sized for (sizeof(struct rte_memzone)).
 * NOTE(review): the fields shown here may be a subset of the full
 * definition -- confirm against the DPDK headers before relying on layout.
 */
struct rte_memzone {
#define RTE_MEMZONE_NAMESIZE 32 /**< Maximum length of memory zone name.*/
char name[RTE_MEMZONE_NAMESIZE]; /**< Name of the memory zone. */
size_t len; /**< Length of the memzone. */
uint64_t hugepage_sz; /**< The page size of underlying memory */
int32_t socket_id; /**< NUMA socket ID. */
uint32_t flags; /**< Characteristics of this memzone. */
} __attribute__((__packed__));
/*
 * Bookkeeping for a named array of fixed-size elements: its name, how many
 * entries are stored, its current length, the element size, the data
 * pointer and a lock for multiprocess use.
 */
struct rte_fbarray {
char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */
unsigned int count; /**< number of entries stored */
unsigned int len; /**< current length of the array */
unsigned int elt_sz; /**< size of each element */
void *data; /**< data pointer */
rte_rwlock_t rwlock; /**< multiprocess lock */
};
涉及的代码块:
/*
 * Set up the memzone descriptor array inside the shared memory config.
 *
 * A primary process initializes the array ("memzone", RTE_MAX_MEMZONE
 * entries of struct rte_memzone); a secondary process attaches to it.
 *
 * Returns 0 on success, -1 when initializing or attaching fails. The
 * caller tests the result with "< 0", so a value must always be returned.
 */
int
rte_eal_memzone_init(void)
{
	struct rte_mem_config *mcfg;
	int ret = 0;

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			rte_fbarray_init(&mcfg->memzones, "memzone",
			RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
		/* non-zero result from rte_fbarray_init: report failure */
		ret = -1;
	} else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
			rte_fbarray_attach(&mcfg->memzones)) {
		/* non-zero result from rte_fbarray_attach: report failure */
		ret = -1;
	}

	return ret;
}
初始化mcfg->memzones, 申请一块内存空间,用于保存以后内存分配时使用到的struct rte_memzone,后面memzone所使用的内存空间也是从rte_heap中分配的;
GDB下看到的信息:
(gdb) p rte_config.mem_config.memzones
$17 = {name = "memzone", '\000' <repeats 56 times>, count = 0, len = 2560, elt_sz = 72, data = 0x100000000,
rwlock = {cnt = 0}}
5、rte_eal_memory_init() : 内存初始化过程的核心
先后调用了:
----memseg_primary_init()
----eal_memalloc_init()
----rte_eal_hugepage_init()
----rte_eal_memdevice_init().
1) memseg_primary_init() 初始化memsegs list
涉及的结构体:
struct rte_memseg_list {
RTE_STD_C11
union {
void *base_va;
/**< Base virtual address for this memseg list. */
uint64_t addr_64;
/**< Makes sure addr is always 64-bits */
};
//指向一块用于存放rte_memseg的内存空间
uint64_t page_sz; /**< Page size for all memsegs in this list. */
int socket_id; /**< Socket ID for all memsegs in this list. */
······
size_t len; /**< Length of memory area covered by this memseg list. */
//指明具有base_va所指向的内存空间的字节数总量
······
struct rte_fbarray memseg_arr;
//用于管理base_va指向的内存空间,包含rte_memseg相关的元数据
};
涉及的代码块:
/* limit number of segment lists according to our maximum */
n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
/* create all segment lists */
for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
msl = &mcfg->memsegs[msl_idx++];
if (alloc_memseg_list(msl, pagesz, n_segs,
socket_id, cur_seglist))
goto out;
}
确定每一种类型(由socket id和page sz确定)的struct rte_memseg_list的数量,及其所包含的mem segment的数量。然后,根据确定的数量为mcfg->memsegs中的struct rte_memseg_list分配虚拟内存空间。
GDB下看到的信息:
(gdb) p rte_config.mem_config.memsegs[0]
$1 = {{base_va = 0x100200000, addr_64 = 4297064448}, page_sz = 2097152, socket_id = 0, version = 0,
len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-0", '\000' <repeats 47 times>,
count = 0, len = 8192, elt_sz = 48, data = 0x10002e000, rwlock = {cnt = 0}}}
(gdb) p rte_config.mem_config.memsegs[1]
$2 = {{base_va = 0x500400000, addr_64 = 21479030784}, page_sz = 2097152, socket_id = 0, version = 0,
len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-1", '\000' <repeats 47 times>,
count = 0, len = 8192, elt_sz = 48, data = 0x500200000, rwlock = {cnt = 0}}}
(gdb) p rte_config.mem_config.memsegs[2]
$4 = {{base_va = 0x900600000, addr_64 = 38660997120}, page_sz = 2097152, socket_id = 0, version = 0,
len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-2", '\000' <repeats 47 times>,
count = 0, len = 8192, elt_sz = 48, data = 0x900400000, rwlock = {cnt = 0}}}
(gdb) p rte_config.mem_config.memsegs[3]
$5 = {{base_va = 0xd00800000, addr_64 = 55842963456}, page_sz = 2097152, socket_id = 0, version = 0,
len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-3", '\000' <repeats 47 times>,
count = 0, len = 8192, elt_sz = 48, data = 0xd00600000, rwlock = {cnt = 0}}}
2)eal_memalloc_init() :初始化memseg list 的fd
涉及的结构体:
/*
 * File descriptor bookkeeping, one entry per memseg list
 * (RTE_MAX_MEMSEG_LISTS entries). alloc_list() fills an entry with an
 * array of fds preset to -1 and sets memseg_list_fd to -1.
 */
static struct {
int *fds; /**< dynamically allocated array of segment lock fd's */
int memseg_list_fd; /**< memseg list fd */
int len; /**< total length of the array */
int count; /**< entries used in an array */
} fd_list[RTE_MAX_MEMSEG_LISTS];
涉及的代码块:
/*
 * Allocate the fd bookkeeping entry that belongs to one memseg list.
 *
 * Lists flagged as external are skipped and reported as success (0).
 * For any other list the entry index is the list's position inside
 * mcfg->memsegs and the entry is sized to the length of the list's
 * memseg array; the result of alloc_list() is returned unchanged.
 */
static int
fd_list_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int list_pos;

	if (!msl->external) {
		/* position of this list within the shared memsegs array */
		list_pos = msl - mcfg->memsegs;
		return alloc_list(list_pos, msl->memseg_arr.len);
	}

	return 0;
}
/*
 * Prepare fd_list[list_idx] to hold `len` file descriptors.
 *
 * Every slot of the newly allocated array starts out as -1, as does the
 * entry's memseg_list_fd; the used-entry count starts at 0.
 *
 * Returns 0 on success, -1 (after logging) when the allocation fails.
 */
static int
alloc_list(int list_idx, int len)
{
	int *fds;
	int slot;

	/* room for one descriptor per possible segment */
	fds = malloc(sizeof(*fds) * len);
	if (fds == NULL) {
		RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
		return -1;
	}

	/* mark every descriptor as invalid */
	slot = 0;
	while (slot < len) {
		fds[slot] = -1;
		slot++;
	}

	fd_list[list_idx].memseg_list_fd = -1;
	fd_list[list_idx].count = 0;
	fd_list[list_idx].len = len;
	fd_list[list_idx].fds = fds;

	return 0;
}
如果是single-file-segments mode, 则对于一个rte_memseg_list,只使用一个file descriptor(fd_list中的memseg_list_fd)
如果是file-per-page, 则对于一个rte_memseg_list中的每一个mem segment, 都会使用一个file descriptor(fd_list中的fds)
GDB下看到的信息:
(gdb) p fd_list[0]
$19 = {fds = 0x7453fa0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[1]
$20 = {fds = 0x745bfb0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[2]
$21 = {fds = 0x7463fc0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[3]
$22 = {fds = 0x746bfd0, memseg_list_fd = -1, len = 8192, count = 0}
3)rte_eal_hugepage_init()初始化大页内存
涉及的结构体:
/*
 * Descriptor of one memory segment: its IO/physical start address, its
 * virtual start address, its length, the underlying page size and the
 * NUMA socket it belongs to.
 */
struct rte_memseg { /* one rte_memseg corresponds to one memory page (one hugepage when hugepages are used) */
RTE_STD_C11
union {
phys_addr_t phys_addr; /**< deprecated - Start physical address. */
rte_iova_t iova; /**< Start IO address. */
};
RTE_STD_C11
union {
void *addr; /**< Start virtual address. */
uint64_t addr_64; /**< Makes sure addr is always 64 bits */
};
size_t len; /**< Length of the segment. */
uint64_t hugepage_sz; /**< The pagesize of underlying memory */
int32_t socket_id; /**< NUMA socket ID. */
uint32_t nchannel; /**< Number of channels. */
uint32_t nrank; /**< Number of ranks. */
uint32_t flags; /**< Memseg-specific flags */
} __rte_packed;
/*
 * Record describing one hugepage and the file that backs it: the virtual
 * addresses of its two mappings, its physical address, size, NUMA socket,
 * index and backing file path.
 */
struct hugepage_file {
void *orig_va; /**< virtual addr of first mmap() */
void *final_va; /**< virtual addr of 2nd mmap() */
uint64_t physaddr; /**< physical addr */
size_t size; /**< the page size */
int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */
/* i.e. this is the file_id-th hugepage whose size is `size` */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
/* filepath names the file that corresponds to this hugepage */
};
涉及的legacy代码块:
如果是legacy mem, 则调用eal_legacy_hugepage_init()
/* create a memseg list */
msl = &mcfg->memsegs[0];
page_sz = RTE_PGSIZE_4K;
n_segs = internal_config.memory / page_sz;
addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
msl->base_va = addr;
msl->page_sz = page_sz;
msl->socket_id = 0;
msl->len = internal_config.memory;
/* populate memsegs. each memseg is one page long */
for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
arr = &msl->memseg_arr;
ms = rte_fbarray_get(arr, cur_seg);
if (rte_eal_iova_mode() == RTE_IOVA_VA)
ms->iova = (uintptr_t)addr;
else
ms->iova = RTE_BAD_IOVA;
ms->addr = addr;
ms->hugepage_sz = page_sz;
ms->socket_id = 0;
ms->len = page_sz;
rte_fbarray_set_used(arr, cur_seg);
addr = RTE_PTR_ADD(addr, (size_t)page_sz);
}
a、根据internal_config->hugepage_info初始化hugepage_file, 并且将这些hugepage_file, 根据<socket id, pagesz>对应的rte_memseg_list中的rte_memseg进行映射;
/* map all hugepages and sort them */
for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
unsigned pages_old, pages_new;
struct hugepage_info *hpi;
/*
* we don't yet mark hugepages as used at this stage, so
* we just map all hugepages available to the system
* all hugepages are still located on socket 0
*/
hpi = &internal_config.hugepage_info[i];
/* map all hugepages available */
pages_old = hpi->num_pages[0];
pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
if (phys_addrs_available &&
rte_eal_iova_mode() != RTE_IOVA_VA) {
/* find physical addresses for each hugepage */
if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
"for %u MB pages\n",
(unsigned int)(hpi->hugepage_sz / 0x100000));
goto fail;
}
} else {
/* set physical addresses for each hugepage */
if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
"for %u MB pages\n",
(unsigned int)(hpi->hugepage_sz / 0x100000));
goto fail;
}
}
if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
(unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
sizeof(struct hugepage_file), cmp_physaddr);
/* we have processed a num of hugepages of this size, so inc offset */
hp_offset += hpi->num_pages[0];
}
b、然后qsort对hugepage_file排序(使得按照页的size降序排序,同一种size按照物理地址升序排序);
c、然后find_numasocket根据internal_config->socket_mem计算hugepage在不同socket的分布;
d、之后remap_needed_hugepages循环调用remap_segment对所有的hugepage_file进行重映射,使得虚拟内存连续的mem segments在物理内存上也是连续的,并且同一个rte_memseg_list中所有mem segment的虚拟地址和物理地址都是单调递增的。
e、接着,设置fd_list中对应的file descriptor。
这个方法会将hugepage_file写入到hugepage_data文件。
在实现的过程中采用read-ahead,目的是为了保证虚拟内存连续的mem segments在物理内存上也是连续的,同时也能够提前载入物理页,提高系统的性能。而对于nohugepage的情况,将其视为legacy, single-file mode,采用的页的大小为4K。
如果是dynamic mem, 则调用eal_hugepage_init()
涉及代码块:
for (hp_sz_idx = 0;
hp_sz_idx < (int)internal_config.num_hugepage_sizes;
hp_sz_idx++) {
for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
socket_id++) {
struct rte_memseg **pages;
struct hugepage_info *hpi = &used_hp[hp_sz_idx];
unsigned int num_pages = hpi->num_pages[socket_id];
int num_pages_alloc, i;
pages = malloc(sizeof(*pages) * num_pages);
num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages,
num_pages, hpi->hugepage_sz,
socket_id, true);
if (num_pages_alloc < 0) {
free(pages);
return -1;
}
/* memalloc is locked, so it's safe to use thread-unsafe version */
ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
page_sz = (size_t)msl->page_sz;
msl_idx = msl - mcfg->memsegs;
cur_msl = &mcfg->memsegs[msl_idx];
need = wa->n_segs;
/* try finding space in memseg list */
cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
for (i = 0; i < need; i++, cur_idx++) {
struct rte_memseg *cur;
void *map_addr;
cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
map_addr = RTE_PTR_ADD(cur_msl->base_va,
cur_idx * page_sz);
if (alloc_seg(cur, map_addr, wa->socket, wa->hi)
ms->addr = addr;
ms->hugepage_sz = alloc_sz;
ms->len = alloc_sz;
ms->nchannel = rte_memory_get_nchannel();
ms->nrank = rte_memory_get_nrank();
ms->iova = iova;
ms->socket_id = socket_id;
根据socket_mem的需求,计算hugepage在不同socket的分布。然后依次通过eal_memalloc_alloc_seg_bulk -> alloc_seg_walk -> alloc_seg进行分配,
由于这个方法是一个一个mem segment进行分配,所以不能保证分配完成后,虚拟空间上连续的mem segments在物理上也是连续的.
采用了pre-allocate,能够提高系统的性能。
4) rte_eal_memdevice_init()
设置mcfg->nchannel, mcfg->nrank
(gdb) p (struct rte_memseg)(rte_config.mem_config.memsegs.memseg_arr.data)
$32 = {{phys_addr = 4295155712, iova = 4295155712}, {addr = 0x0, addr_64 = 0}, len = 21479030784,
hugepage_sz = 2097152, socket_id = 0, nchannel = 0, nrank = 0, flags = 4}
6、rte_eal_malloc_heap_init()
初始化mcfg->malloc_heaps;并且注册进程间通信的handle,用于多进程环境下的内存分配;初始化heap的结构。其中每一个socket会对应一个heap。
初始化完成后heap的结构如下(一个例子):
假设系统支持两种大小的hugepage(2MB, 1GB)
上图的heap包含两个rte_memseg_list, 每一个都包含3个contiguous mem segments(其中可能包含一个或多个hugepage), 总共有6个contiguous mem segments(图中浅黄色的部分). 每一个contiguous mem segments都包含一个malloc_elem,用于记录此contiguous mem segments的元数据。每一个struct malloc_heap都会指向第一个malloc_elem和最后一个malloc_elem;并且一个heap中,所有的malloc_elem会组成一个双向链表。
二、对于secondary process的内存初始化过程:
1、rte_config_init()
使用mmap()将config文件映射到此进程的mcfg,这样可以直接读取primary process的内存映像.
2、eal_hugepage_info_read()
secondary process不会调用eal_hugepage_info_init(),而是调用eal_hugepage_info_read():读取primary process写入的hugepage_info文件的内容,并保存在struct internal_config->hugepage_info中。
3、rte_eal_memzone_init()
根据config文件中关于memzones的内容, 创建一个和primary process具有相同内存映像的mcfg->memzones
4、rte_eal_memory_init()
这是内存初始化过程的核心,其中包括了
----memseg_secondary_init(),
----eal_memalloc_init(),
----rte_eal_hugepage_attach(),
----rte_eal_memdevice_init().
1) memseg_secondary_init() :
直接根据config文件的内容创建和primary process相同的虚拟内存空间视图。
2)eal_memalloc_init() :
对mcfg中的struct rte_memseg_list, 创建一个本地副本(即local_memsegs),用于同步memory hotplug;然后初始化struct fd_list:如果是single-file-segments mode, 则对于一个rte_memseg_list,只使用一个file descriptor(fd_list中的memseg_list_fd);如果是file-per-page, 则对于一个rte_memseg_list中的每一个mem segment, 都会使用一个file descriptor(fd_list中的fds)。
3)rte_eal_hugepage_attach():
如果是legacy mem, eal_legacy_hugepage_attach()
读取hugepage_data文件,根据文件的内容建立与primary process相应的内存映像,并且设置fd_list中相应的file descriptor。如果是dynamic mem, eal_hugepage_attach()调用eal_memalloc_sync_with_primary(), 将primary process的mcfg->memsegs同步到此进程的local_memsegs。
4)rte_eal_memdevice_init() :
不做任何操作。
5、rte_eal_malloc_heap_init()
初始化mcfg->malloc_heaps;并且注册进程间通信的handle,用于多进程环境下的内存分配;初始化heap的结构。
三、总结
1、如果没有采用hugetlbfs,则默认采用系统页(大小为4K)
2、DPDK有两种内存模式 :
legacy mode : 保证虚拟空间连续的contiguous mem segments在物理空间上也是连续的
dynamic mode : 分配hugepage时是一个一个分配的,不能和legacy mode有一样的保证
3、DPDK在memalloc时有两种模式single-file-segments, page-per-file, 每一种都在hugetlbfs的挂载点上有相应的文件形式(即存在于内存中的文件),这样在内存分配时可以使用对file descriptor操作的系统调用对内存进行操作。
4、每一个socket有一个heap, 每一个heap包含若干个rte_memseg_list, 每一个rte_memseg_list包含若干rte_memseg, 一个rte_memseg对应于一个memory page。
5、在分配内存时,采用了read-ahead, pre-allocated等方法,能够减少由于页错误而阻塞的情况,提高系统的性能。