Summary of the DPDK 18.11.11 Memory Initialization Flow


Preface: This article summarizes the memory initialization flow in DPDK's EAL (Environment Abstraction Layer). Since DPDK supports multi-process applications, the summary mainly traces the flow of the primary process. First, a quick look at the primary/secondary process concepts:
 
 
1. In DPDK, initialization is performed by the primary process. All other processes are called secondary processes; they can read a set of files to obtain the primary process's initialization information and thereby keep the same memory image as the primary process.
 
2. DPDK uses centralized control. For example, in a multi-process scenario, when a secondary process wants to allocate memory, it sends a request to the primary process; the primary process performs the operation and then notifies the secondary process.
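For illustration, a minimal sketch (assuming an application built against DPDK 18.11 and linked with librte_eal) of how an application can tell which role it was started in; both functions used here are public EAL APIs:

#include <stdio.h>
#include <rte_eal.h>

int main(int argc, char **argv)
{
    /* rte_eal_init() runs the whole initialization flow described below */
    if (rte_eal_init(argc, argv) < 0)
        return -1;

    /* primary vs. secondary is decided during rte_config_init() */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY)
        printf("running as primary process\n");
    else
        printf("running as secondary process\n");

    return 0;
}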
 
I. Code call flow of the initialization
Starting from the function int rte_eal_init(int argc, char **argv) in lib/librte_eal/linux/eal/eal.c, the memory initialization call chain is:
int rte_eal_init()
----eal_reset_internal_config()
----rte_config_init()
----eal_hugepage_info_init()
----rte_eal_memzone_init()
----rte_eal_memory_init()
----rte_eal_malloc_heap_init()
Each of these steps is analyzed in turn below:
 
int
rte_eal_init(int argc, char **argv)
{
    ......
    eal_reset_internal_config(&internal_config);

    rte_config_init();

    if (internal_config.no_hugetlbfs == 0) {
        /* rte_config isn't initialized yet */
        ret = internal_config.process_type == RTE_PROC_PRIMARY ?
                eal_hugepage_info_init() :
                eal_hugepage_info_read();
        ......
    }
    ......
    if (rte_eal_memzone_init() < 0) { ...... }
    if (rte_eal_memory_init() < 0) { ...... }
    if (rte_eal_malloc_heap_init() < 0) { ...... }
}
 
 
 
1. eal_reset_internal_config(): initialize the global variable internal_config

The main members of the structure are:
 
struct internal_config {                      /* DPDK global configuration */
    volatile size_t memory;                   /**< amount of asked memory */
    ......
    volatile unsigned no_hugetlbfs;           /**< true to disable hugetlbfs */

    unsigned hugepage_unlink;                 /**< true to unlink backing files */
    /* memalloc treats each hugepage as a file; this controls whether the
     * backing files are unlinked */
    ......
    volatile unsigned no_shconf;              /**< true if there is no shared config */
    /* if set, the primary process does not write its init info to files */

    volatile enum rte_proc_type_t process_type; /**< multi-process proc type */
    /* distinguishes the primary process from secondary processes */

    /** true to try allocating memory on specific sockets */
    volatile unsigned force_sockets;
    volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */

    volatile unsigned force_socket_limits;    /* whether per-socket memory is limited */
    volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES]; /**< limit amount of memory per socket */

    uintptr_t base_virtaddr;                  /**< base address to try and reserve memory from */

    volatile unsigned legacy_mem;             /* legacy mode vs. dynamic mode */

    volatile unsigned single_file_segments;
    /**< true if storing all pages within single files (per-page-size,
     * per-node), non-legacy mode only. */
    /* single-file-segments mode vs. page-per-file mode */

    unsigned num_hugepage_sizes;              /**< how many sizes on this system */
    /* number of hugepage sizes supported by the system (e.g. 2 MB, 1 GB) */

    struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
    /* per-size hugepage information; the main structure initialized here */
};
 
 
The corresponding reset function assigns initial values to the main members:
 
void
eal_reset_internal_config(struct internal_config *internal_cfg)
{
    int i;

    internal_cfg->memory = 0;
    internal_cfg->force_nrank = 0;
    internal_cfg->force_nchannel = 0;
    internal_cfg->hugefile_prefix = NULL;
    internal_cfg->hugepage_dir = NULL;
    ......
    internal_cfg->create_uio_dev = 0;
    internal_cfg->iova_mode = RTE_IOVA_DC;
    internal_cfg->user_mbuf_pool_ops_name = NULL;
    CPU_ZERO(&internal_cfg->ctrl_cpuset);
    internal_cfg->init_complete = 0;
}
 
 
The initialized values as seen in GDB:
 
(gdb) p internal_config
$1 = {memory = 0, force_nchannel = 0, force_nrank = 0, no_hugetlbfs = 0, hugepage_unlink = 0, no_pci = 0,
no_hpet = 1, vmware_tsc_map = 0, no_shconf = 0, in_memory = 0, create_uio_dev = 0,
process_type = RTE_PROC_PRIMARY, force_sockets = 0, socket_mem = {0, 0, 0, 0, 0, 0, 0, 0},
force_socket_limits = 0, socket_limit = {0, 0, 0, 0, 0, 0, 0, 0}, base_virtaddr = 0, legacy_mem = 0,
single_file_segments = 0, syslog_facility = 24, vfio_intr_mode = RTE_INTR_MODE_NONE, hugefile_prefix = 0x0,
hugepage_dir = 0x0, user_mbuf_pool_ops_name = 0x0, num_hugepage_sizes = 0, hugepage_info = {{
hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0, 0, 0, 0, 0, 0},
lock_descriptor = -1}, {hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0,
0, 0, 0, 0, 0}, lock_descriptor = -1}, {hugepage_sz = 0, hugedir = '\000' <repeats 4095 times>,
num_pages = {0, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = -1}, {hugepage_sz = 0,
hugedir = '\000' <repeats 4095 times>, num_pages = {0, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = -1}},
iova_mode = RTE_IOVA_DC, ctrl_cpuset = {__bits = {1, 0 <repeats 15 times>}}, init_complete = 0}
(gdb)
(gdb) p internal_config.process_type
$2 = RTE_PROC_PRIMARY
 
That is, the subsequent memory initialization follows the RTE_PROC_PRIMARY (primary process) path.
 
2. rte_config_init(): initialize the memory configuration

Structures involved:
 
struct rte_config {   /* run-time environment configuration */
    ......
    /** PA or VA mapping mode */
    enum rte_iova_mode iova_mode;
    /* whether DMA uses virtual addresses (VA) or physical addresses (PA) */

    /**
     * Pointer to memory configuration, which may be shared across multiple
     * DPDK instances
     */
    struct rte_mem_config *mem_config;
    /* the memory this pointer refers to describes the memory layout of one
     * DPDK instance; the memory initialization mainly fills in the fields
     * of struct rte_mem_config */
} __attribute__((__packed__));
 
struct rte_mem_config {
    volatile uint32_t magic;    /**< Magic number - Sanity check. */

    /* memory topology */
    uint32_t nchannel;          /**< Number of channels (0 if unknown). */
    uint32_t nrank;             /**< Number of ranks (0 if unknown). */
    ......
    /* memory segments and zones */
    struct rte_fbarray memzones; /**< Memzone descriptors. */

    /* each struct rte_memseg_list is identified by <socket id, page_sz>;
     * memsegs may contain several lists with the same <socket id, page_sz> */
    struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
    /**< list of dynamic arrays holding memsegs */
    ......
    /* Heaps of Malloc */
    struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
    ......
    uint64_t mem_cfg_addr;
    /* this address equals the struct rte_mem_config *mem_config pointer
     * stored in struct rte_config */

    /* legacy mem and single file segments options are shared */
    uint32_t legacy_mem;            /* legacy mode vs. dynamic mode */
    uint32_t single_file_segments;  /* single-file-segments mode vs. page-per-file mode */
    ......
} __attribute__((__packed__));
 
 
Code involved:
 
/* Sets up rte_config structure with the pointer to shared memory config. */
static void
rte_config_init(void)
{
    rte_config.process_type = internal_config.process_type;

    switch (rte_config.process_type) {
    case RTE_PROC_PRIMARY:
        rte_eal_config_create();
        break;
    }
}
 
This function mainly allocates a block of memory for the struct rte_mem_config *mem_config (mcfg for short) member of struct rte_config, creates a file named config in the runtime directory, and writes the contents of mcfg into that file. A secondary process can then, during its own initialization, read the config file and build the same memory image as the primary process.
 
/* create memory configuration in shared/mmap memory. Take out
 * a write lock on the memsegs, so we can auto-detect primary/secondary.
 * This means we never close the file while running (auto-close on exit).
 * We also don't lock the whole file, so that in future we can use read-locks
 * on other parts, e.g. memzones, to detect if there are running secondary
 * processes. */
static void
rte_eal_config_create(void)
{
    void *rte_mem_cfg_addr;
    int retval;

    const char *pathname = eal_runtime_config_path();

    /* map the config before hugepage address so that we don't waste a page */
    if (internal_config.base_virtaddr != 0)
        rte_mem_cfg_addr = (void *)
            RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
            sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
    else
        rte_mem_cfg_addr = NULL;

    if (mem_cfg_fd < 0) {
        mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
        if (mem_cfg_fd < 0)
            rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
    }
    ......
    rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
            PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);

    if (rte_mem_cfg_addr == MAP_FAILED) {
        rte_panic("Cannot mmap memory for rte_config\n");
    }
    memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
    rte_config.mem_config = rte_mem_cfg_addr;
}
 
 
Here mmap() is used to map the config file onto mcfg, so during the subsequent initialization any write to the config is immediately visible to the other processes (similar to shared-memory communication).
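The underlying mechanism can be shown with a plain POSIX sketch (not DPDK code; the file path below is made up): every process that maps the same file with MAP_SHARED sees writes made through any other such mapping.

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    /* hypothetical path, used only for this illustration */
    int fd = open("/tmp/shared_cfg", O_RDWR | O_CREAT, 0600);
    if (fd < 0)
        return -1;

    if (ftruncate(fd, 4096) != 0)      /* size the backing file */
        return -1;

    char *cfg = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_SHARED, fd, 0);
    if (cfg == MAP_FAILED)
        return -1;

    /* any process mapping the same file with MAP_SHARED sees this write */
    strcpy(cfg, "hello from primary");

    munmap(cfg, 4096);
    close(fd);
    return 0;
}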
 
# ls /var/run/dpdk/rte/config 
/var/run/dpdk/rte/config
The data as seen in GDB:
 
(gdb) p rte_config 
$2 = {master_lcore = 1, lcore_count = 3, numa_node_count = 1, numa_nodes = {0, 0, 0, 0, 0, 0, 0, 0}, 
  service_lcore_count = 0, lcore_role = {ROLE_OFF, ROLE_RTE, ROLE_RTE, ROLE_RTE, 
    ROLE_OFF <repeats 252 times>}, process_type = RTE_PROC_PRIMARY, iova_mode = RTE_IOVA_DC, 
  mem_config = 0x7fb4a0e000}
  
(gdb) p rte_config.mem_config 
$3 = (struct rte_mem_config *) 0x7fb4a0e000
 
(gdb) p /x rte_config.mem_config.mem_cfg_addr 
$7 = 0x7fb4a0e000
 
3. eal_hugepage_info_init(): read the hugepage information of the system

Structure involved:
 
/*
 * internal configuration structure for the number, size and
 * mount points of hugepages
 */
struct hugepage_info {
    uint64_t hugepage_sz;   /**< size of a huge page */

    char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */

    uint32_t num_pages[RTE_MAX_NUMA_NODES];
    /**< number of hugepages of that size on each socket */

    int lock_descriptor;    /**< file descriptor for the hugepage dir (hugedir) */
};
 
 
Code involved:
 
static int
hugepage_info_init(void)
{
    DIR *dir;
    struct dirent *dirent;

    dir = opendir(sys_dir_path);
    if (dir == NULL) {
        RTE_LOG(ERR, EAL,
            "Cannot open directory %s to read system hugepage info\n",
            sys_dir_path);
        return -1;
    }

    for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
        struct hugepage_info *hpi;
        ......
        hpi = &internal_config.hugepage_info[num_sizes];
        hpi->hugepage_sz =
            rte_str_to_size(&dirent->d_name[dirent_start_len]);

        /* first, check if we have a mountpoint */
        if (get_hugepage_dir(hpi->hugepage_sz,
                hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
            uint32_t num_pages;

            num_pages = get_num_hugepages(dirent->d_name);
            if (num_pages > 0)
                ......
            continue;
        }

        /* try to obtain a writelock */
        hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);

        /* if blocking lock failed */
        if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
        }

        calc_num_pages(hpi, dirent);

        num_sizes++;
    }
    closedir(dir);

    internal_config.num_hugepage_sizes = num_sizes;

    /* sort the page directory entries by size, largest to smallest */
    qsort(&internal_config.hugepage_info[0], num_sizes,
            sizeof(internal_config.hugepage_info[0]), compare_hpi);
}
 
 
 
On Linux, the system directory /sys/kernel/mm/hugepages is opened and every entry is traversed to obtain the hugepage sizes supported by the system. The corresponding mount point for each hugepage size is then looked up in /proc/mounts, and the number of free hugepages of each size on each socket is calculated. The information for each hugepage size is stored in internal_config->hugepage_info. Finally, a file named hugepage_info is created in the runtime dir and internal_config->hugepage_info is written into it.
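A simplified, self-contained sketch of the same directory scan (the /sys layout is standard Linux; the parsing below is only an illustration, not the DPDK implementation):

#include <dirent.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    /* each entry is named "hugepages-<size>kB", e.g. "hugepages-2048kB" */
    const char *sys_dir = "/sys/kernel/mm/hugepages";
    DIR *dir = opendir(sys_dir);
    if (dir == NULL) {
        perror("opendir");
        return -1;
    }

    struct dirent *ent;
    while ((ent = readdir(dir)) != NULL) {
        if (strncmp(ent->d_name, "hugepages-", 10) != 0)
            continue;
        /* parse the size in kB out of the directory name */
        uint64_t size_kb = strtoull(ent->d_name + 10, NULL, 10);
        printf("supported hugepage size: %" PRIu64 " kB\n", size_kb);
    }
    closedir(dir);
    return 0;
}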
 
Information seen in GDB:
 
(gdb) p internal_config.hugepage_info 
$9 = {hugepage_sz = 2097152, hugedir = "/mnt/hugetlbfs", '\000' <repeats 4081 times>, num_pages = {6667, 0, 0, 0, 0, 0, 0, 0}, lock_descriptor = 10}
 
(gdb) p 2097152/1024/1024
$12 = 2
 
# ls /sys/kernel/mm/hugepages/
hugepages-2048kB
 
# cat /proc/mounts | grep hugetlbfs
none /mnt/hugetlbfs hugetlbfs rw,relatime 0 0
 
# cat /proc/meminfo | grep Huge
AnonHugePages:     88064 kB
HugePages_Total:    6667
HugePages_Free:     6667
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:       2048 kB
 
# ls  /var/run/dpdk/rte/
config         hugepage_info  mp_socket
 
# cat  /var/run/dpdk/rte/hugepage_info 
 /mnt/hugetlbfs
 
 
4. rte_eal_memzone_init(): initialize the memzones

Structures involved:
 
struct rte_memzone {

#define RTE_MEMZONE_NAMESIZE 32       /**< Maximum length of memory zone name.*/
    char name[RTE_MEMZONE_NAMESIZE];  /**< Name of the memory zone. */
    ......
    size_t len;                       /**< Length of the memzone. */
    uint64_t hugepage_sz;             /**< The page size of underlying memory */
    int32_t socket_id;                /**< NUMA socket ID. */
    uint32_t flags;                   /**< Characteristics of this memzone. */
} __attribute__((__packed__));

struct rte_fbarray {
    char name[RTE_FBARRAY_NAME_LEN]; /**< name associated with an array */
    unsigned int count;              /**< number of entries stored */
    unsigned int len;                /**< current length of the array */
    unsigned int elt_sz;             /**< size of each element */
    void *data;                      /**< data pointer */
    rte_rwlock_t rwlock;             /**< multiprocess lock */
};
 
 
Code involved:
 
int
rte_eal_memzone_init(void)
{
    struct rte_mem_config *mcfg;

    /* get pointer to global configuration */
    mcfg = rte_eal_get_configuration()->mem_config;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
            rte_fbarray_init(&mcfg->memzones, "memzone",
            RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
    } else if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
            rte_fbarray_attach(&mcfg->memzones)) {
    }
}
 
This initializes mcfg->memzones: it reserves a memory area to hold the struct rte_memzone descriptors used by later allocations; the memory backing each memzone itself is later allocated from the rte heap.

Information seen in GDB:
 
(gdb) p rte_config.mem_config.memzones 
$17 = {name = "memzone", '\000' <repeats 56 times>, count = 0, len = 2560, elt_sz = 72, data = 0x100000000, 
  rwlock = {cnt = 0}}
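
As a usage illustration outside the initialization path (a minimal sketch; the memzone name "example_mz" is made up), one of the descriptors in this fbarray is consumed by a call such as rte_memzone_reserve():

#include <stdio.h>
#include <rte_eal.h>
#include <rte_memzone.h>

int main(int argc, char **argv)
{
    if (rte_eal_init(argc, argv) < 0)
        return -1;

    /* reserve 1 MB on any socket; "example_mz" is a hypothetical name */
    const struct rte_memzone *mz = rte_memzone_reserve("example_mz",
            1 << 20, SOCKET_ID_ANY, 0);
    if (mz == NULL)
        return -1;

    printf("memzone '%s': virt=%p len=%zu\n", mz->name, mz->addr, mz->len);
    return 0;
}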
 
5. rte_eal_memory_init(): the core of the memory initialization
It calls, in order:
----memseg_primary_init()
----eal_memalloc_init()
----rte_eal_hugepage_init()
----rte_eal_memdevice_init().
 
1) memseg_primary_init(): initialize the memseg lists

Structure involved:
 
struct rte_memseg_list {
    RTE_STD_C11
    union {
        void *base_va;
        /**< Base virtual address for this memseg list. */
        uint64_t addr_64;
        /**< Makes sure addr is always 64-bits */
    };
    /* base_va points to the VA region reserved for the rte_memsegs of this list */

    uint64_t page_sz;  /**< Page size for all memsegs in this list. */
    int socket_id;     /**< Socket ID for all memsegs in this list. */
    ......
    size_t len;        /**< Length of memory area covered by this memseg list. */
    /* total number of bytes of the region pointed to by base_va */
    ......
    struct rte_fbarray memseg_arr;
    /* manages the region pointed to by base_va; holds the per-rte_memseg metadata */
};
 
 
Code involved:
 
/* limit number of segment lists according to our maximum */
n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

/* create all segment lists */
for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
    if (msl_idx >= RTE_MAX_MEMSEG_LISTS)
        goto out;   /* no more room for memseg lists */

    msl = &mcfg->memsegs[msl_idx++];

    if (alloc_memseg_list(msl, pagesz, n_segs,
            socket_id, cur_seglist))
        goto out;
    ......
}
 
This determines, for each type (identified by socket id and page size), the number of struct rte_memseg_list instances and the number of mem segments each one holds. Based on these numbers, virtual address space is then reserved for each struct rte_memseg_list in mcfg->memsegs.

Information seen in GDB:
 
(gdb) p rte_config.mem_config.memsegs[0]
$1 = {{base_va = 0x100200000, addr_64 = 4297064448}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-0", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0x10002e000, rwlock = {cnt = 0}}}
 
(gdb) p rte_config.mem_config.memsegs[1]
$2 = {{base_va = 0x500400000, addr_64 = 21479030784}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-1", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0x500200000, rwlock = {cnt = 0}}}
    
(gdb) p rte_config.mem_config.memsegs[2]
$4 = {{base_va = 0x900600000, addr_64 = 38660997120}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-2", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0x900400000, rwlock = {cnt = 0}}}
    
(gdb) p rte_config.mem_config.memsegs[3]
$5 = {{base_va = 0xd00800000, addr_64 = 55842963456}, page_sz = 2097152, socket_id = 0, version = 0, 
  len = 17179869184, external = 0, memseg_arr = {name = "memseg-2048k-0-3", '\000' <repeats 47 times>, 
    count = 0, len = 8192, elt_sz = 48, data = 0xd00600000, rwlock = {cnt = 0}}}
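
The numbers above are consistent with each other: each list covers len = 17179869184 bytes = 16 GiB, and with page_sz = 2097152 bytes (2 MiB) that gives 16 GiB / 2 MiB = 8192 segments per list, matching memseg_arr.len = 8192; the four lists together describe up to 64 GiB of 2 MiB pages on socket 0.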
 
 
2) eal_memalloc_init(): initialize the fd list for each memseg list

Structure involved:
 
static struct {
    int *fds;           /**< dynamically allocated array of segment lock fd's */
    int memseg_list_fd; /**< memseg list fd */
    int len;            /**< total length of the array */
    int count;          /**< entries used in an array */
} fd_list[RTE_MAX_MEMSEG_LISTS];
 
Code involved:
 
static int
fd_list_create_walk(const struct rte_memseg_list *msl,
        void *arg __rte_unused)
{
    struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
    unsigned int len;
    int msl_idx;

    if (msl->external)
        return 0;

    msl_idx = msl - mcfg->memsegs;
    len = msl->memseg_arr.len;

    return alloc_list(msl_idx, len);
}

static int
alloc_list(int list_idx, int len)
{
    int *data;
    int i;

    /* ensure we have space to store fd per each possible segment */
    data = malloc(sizeof(int) * len);
    if (data == NULL) {
        RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
        return -1;
    }
    /* set all fd's as invalid */
    for (i = 0; i < len; i++)
        data[i] = -1;

    fd_list[list_idx].fds = data;
    fd_list[list_idx].len = len;
    fd_list[list_idx].count = 0;
    fd_list[list_idx].memseg_list_fd = -1;

    return 0;
}
 
 
In single-file-segments mode, a single file descriptor (memseg_list_fd in fd_list) is used for a whole rte_memseg_list.
In file-per-page mode, every mem segment of an rte_memseg_list gets its own file descriptor (the fds array in fd_list).
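Conceptually, the descriptor lookup can be pictured as follows (a simplified illustration only; struct fd_entry and get_seg_fd are made-up names, not DPDK's internal helpers):

/* simplified stand-in for one fd_list entry shown above (illustration only) */
struct fd_entry {
    int *fds;            /* one fd per segment (file-per-page mode) */
    int memseg_list_fd;  /* one fd per list (single-file-segments mode) */
};

/* pick the right descriptor for segment seg_idx of list entry e */
static int get_seg_fd(const struct fd_entry *e, int seg_idx,
                      int single_file_segments)
{
    if (single_file_segments)
        return e->memseg_list_fd;
    return e->fds[seg_idx];
}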
 
Information seen in GDB:
 
(gdb) p fd_list[0]
$19 = {fds = 0x7453fa0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[1]
$20 = {fds = 0x745bfb0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[2]
$21 = {fds = 0x7463fc0, memseg_list_fd = -1, len = 8192, count = 0}
(gdb) p fd_list[3]
$22 = {fds = 0x746bfd0, memseg_list_fd = -1, len = 8192, count = 0}
 
3) rte_eal_hugepage_init(): initialize the hugepage memory

Structures involved:
 
struct rte_memseg {   /* one rte_memseg corresponds to one hugepage */
    RTE_STD_C11
    union {
        phys_addr_t phys_addr;  /**< deprecated - Start physical address. */
        rte_iova_t iova;        /**< Start IO address. */
    };
    RTE_STD_C11
    union {
        void *addr;             /**< Start virtual address. */
        uint64_t addr_64;       /**< Makes sure addr is always 64 bits */
    };
    size_t len;                 /**< Length of the segment. */
    uint64_t hugepage_sz;       /**< The pagesize of underlying memory */
    int32_t socket_id;          /**< NUMA socket ID. */
    uint32_t nchannel;          /**< Number of channels. */
    uint32_t nrank;             /**< Number of ranks. */
    uint32_t flags;             /**< Memseg-specific flags */
} __rte_packed;

struct hugepage_file {
    void *orig_va;      /**< virtual addr of first mmap() */
    void *final_va;     /**< virtual addr of 2nd mmap() */
    uint64_t physaddr;  /**< physical addr */
    size_t size;        /**< the page size */
    int socket_id;      /**< NUMA socket ID */
    int file_id;        /**< the '%d' in HUGEFILE_FMT */
    /* this is the file_id-th hugepage of this size */

    char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};
 
 
Code involved (legacy mode): if legacy mem is used, eal_legacy_hugepage_init() is called. The first snippet below is its no-hugetlbfs branch, in which the single memseg list is backed by ordinary 4 KB pages:
 
/* create a memseg list */
msl = &mcfg->memsegs[0];

page_sz = RTE_PGSIZE_4K;
n_segs = internal_config.memory / page_sz;

addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

msl->base_va = addr;
msl->page_sz = page_sz;
msl->socket_id = 0;
msl->len = internal_config.memory;

/* populate memsegs. each memseg is one page long */
for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
    arr = &msl->memseg_arr;

    ms = rte_fbarray_get(arr, cur_seg);
    if (rte_eal_iova_mode() == RTE_IOVA_VA)
        ms->iova = (uintptr_t)addr;
    else
        ms->iova = RTE_BAD_IOVA;
    ms->addr = addr;
    ms->hugepage_sz = page_sz;
    ms->socket_id = 0;
    ms->len = page_sz;

    rte_fbarray_set_used(arr, cur_seg);

    addr = RTE_PTR_ADD(addr, (size_t)page_sz);
}
 
 
a. Initialize the hugepage_file array from internal_config->hugepage_info, and map these hugepage_files onto the rte_memsegs of the rte_memseg_list matching their <socket id, page_sz>.
 
/* map all hugepages and sort them */
for (i = 0; i < (int)internal_config.num_hugepage_sizes; i++) {
    unsigned pages_old, pages_new;
    struct hugepage_info *hpi;

    /*
     * we don't yet mark hugepages as used at this stage, so
     * we just map all hugepages available to the system
     * all hugepages are still located on socket 0
     */
    hpi = &internal_config.hugepage_info[i];

    /* map all hugepages available */
    pages_old = hpi->num_pages[0];
    pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);

    if (phys_addrs_available &&
            rte_eal_iova_mode() != RTE_IOVA_VA) {
        /* find physical addresses for each hugepage */
        if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
            RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
                "for %u MB pages\n",
                (unsigned int)(hpi->hugepage_sz / 0x100000));
            goto fail;
        }
    } else {
        /* set physical addresses for each hugepage */
        if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
            RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
                "for %u MB pages\n",
                (unsigned int)(hpi->hugepage_sz / 0x100000));
            goto fail;
        }
    }

    if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0) {
        RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
            (unsigned)(hpi->hugepage_sz / 0x100000));
        goto fail;
    }

    qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
            sizeof(struct hugepage_file), cmp_physaddr);

    /* we have processed a num of hugepages of this size, so inc offset */
    hp_offset += hpi->num_pages[0];
}
 
 
b. qsort() then sorts the hugepage_file entries (descending by page size; within one size, ascending by physical address).

c. find_numasocket() determines which NUMA socket each hugepage resides on; the number of hugepages to keep on each socket is then computed from internal_config->socket_mem.

d. remap_needed_hugepages() then calls remap_segment() in a loop to remap all hugepage_files, so that mem segments that are contiguous in virtual memory are also contiguous in physical memory, and within one rte_memseg_list the virtual and physical addresses of all mem segments increase monotonically.
e. Finally, the corresponding file descriptors in fd_list are set.
This function also writes the hugepage_file array into the hugepage_data file.
 
The implementation uses read-ahead so that mem segments contiguous in virtual memory are also contiguous in physical memory, and the physical pages are faulted in ahead of time, which improves performance. The no-hugepage case is treated as legacy, single-file mode, with a page size of 4 KB.
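As a conceptual sketch (illustration only, not DPDK code), the condition that the remapping step tries to establish between neighbouring, physically sorted hugepages is simply this:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* minimal stand-in for the fields of struct hugepage_file used here */
struct hp {
    uint64_t physaddr;  /* physical address of the page */
    size_t size;        /* page size */
};

/* pages sorted by physical address are contiguous if each page ends
 * exactly where the next one starts */
static bool phys_contig(const struct hp *a, const struct hp *b)
{
    return a->physaddr + a->size == b->physaddr;
}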
 
If dynamic mem is used, eal_hugepage_init() is called instead.

Code involved:
 
/* eal_hugepage_init(): allocate the requested pages per size and per socket */
for (hp_sz_idx = 0;
        hp_sz_idx < (int)internal_config.num_hugepage_sizes;
        hp_sz_idx++) {
    for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
            socket_id++) {
        struct rte_memseg **pages;
        struct hugepage_info *hpi = &used_hp[hp_sz_idx];
        unsigned int num_pages = hpi->num_pages[socket_id];
        int num_pages_alloc, i;

        pages = malloc(sizeof(*pages) * num_pages);

        num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages,
                num_pages, hpi->hugepage_sz,
                socket_id, true);
        if (num_pages_alloc < 0) {
            free(pages);
            return -1;
        }
        ......
    }
}

/* eal_memalloc_alloc_seg_bulk(): walk the memseg lists */
/* memalloc is locked, so it's safe to use thread-unsafe version */
ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);

/* alloc_seg_walk(): find free slots in the matching memseg list */
page_sz = (size_t)msl->page_sz;

msl_idx = msl - mcfg->memsegs;
cur_msl = &mcfg->memsegs[msl_idx];

need = wa->n_segs;

/* try finding space in memseg list */
cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
for (i = 0; i < need; i++, cur_idx++) {
    struct rte_memseg *cur;
    void *map_addr;

    cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
    map_addr = RTE_PTR_ADD(cur_msl->base_va,
            cur_idx * page_sz);

    if (alloc_seg(cur, map_addr, wa->socket, wa->hi, ......))
        ......
}

/* alloc_seg(): fill in the rte_memseg for the newly mapped page */
ms->addr = addr;
ms->hugepage_sz = alloc_sz;
ms->len = alloc_sz;
ms->nchannel = rte_memory_get_nchannel();
ms->nrank = rte_memory_get_nrank();
ms->iova = iova;
ms->socket_id = socket_id;
 
 
Based on the socket_mem requirements, the distribution of hugepages across sockets is computed. Allocation then goes through eal_memalloc_alloc_seg_bulk() -> alloc_seg_walk() -> alloc_seg().
Because this path allocates mem segments one at a time, it cannot guarantee that segments contiguous in virtual address space are also physically contiguous.
Pre-allocation is used, which improves performance.
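
For illustration (a hypothetical snippet assuming dynamic memory mode and a running primary process): in this mode, growing the heap, e.g. via rte_malloc_socket(), is what triggers the allocation of additional hugepages, one segment at a time.

#include <stdio.h>
#include <rte_eal.h>
#include <rte_malloc.h>

int main(int argc, char **argv)
{
    if (rte_eal_init(argc, argv) < 0)
        return -1;

    /* in dynamic mode this may trigger allocation of new hugepages
     * (one segment at a time) on NUMA socket 0; the name is made up */
    void *buf = rte_malloc_socket("example_buf", 4 * 1024 * 1024, 0, 0);
    if (buf == NULL)
        return -1;

    printf("allocated 4 MB at %p on socket 0\n", buf);
    rte_free(buf);
    return 0;
}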
 
4) rte_eal_memdevice_init()
Sets mcfg->nchannel and mcfg->nrank.
 
(gdb) p (struct rte_memseg)(rte_config.mem_config.memsegs.memseg_arr.data)
$32 = {{phys_addr = 4295155712, iova = 4295155712}, {addr = 0x0, addr_64 = 0}, len = 21479030784, 
  hugepage_sz = 2097152, socket_id = 0, nchannel = 0, nrank = 0, flags = 4}
6. rte_eal_malloc_heap_init()
Initialize mcfg->malloc_heaps; register the inter-process communication handles used for memory allocation in multi-process setups; and initialize the heap structures, with one heap per socket.
After initialization the heap structure looks like this (an example):

Assume the system supports two hugepage sizes (2 MB and 1 GB).
The heap in this example (originally illustrated with a diagram) contains two rte_memseg_lists, each holding 3 contiguous mem segments (each of which may consist of one or more hugepages), for a total of 6 contiguous mem segments. Each contiguous mem segment starts with a malloc_elem recording its metadata. Each struct malloc_heap points to the first and the last malloc_elem, and within one heap all malloc_elems are linked into a doubly linked list, as sketched below.
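
A heavily simplified sketch of that layout (illustrative definitions only; these are not DPDK's actual struct malloc_heap and struct malloc_elem):

#include <stddef.h>

/* illustrative only: one element describing a contiguous area of a heap */
struct elem {
    struct elem *prev;   /* previous element in the heap's element list */
    struct elem *next;   /* next element in the heap's element list */
    size_t size;         /* size of the contiguous area described */
    int free;            /* 1 if this area is free, 0 if allocated */
};

/* illustrative only: one heap per NUMA socket */
struct heap {
    struct elem *first;  /* first element across all memseg lists */
    struct elem *last;   /* last element across all memseg lists */
    int socket_id;       /* socket this heap serves */
};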
 
II. Memory initialization in a secondary process
1. rte_config_init()
mmap() the config file onto this process's mcfg, so that the primary process's memory image can be read directly.
 
2. eal_hugepage_info_read()
Read the contents of the hugepage_info file and store them in internal_config->hugepage_info.
 
3. rte_eal_memzone_init()
Based on the memzones content in the config file, attach to mcfg->memzones so that it has the same memory image as in the primary process.
 
4. rte_eal_memory_init()
This is the core of the memory initialization and includes:
----memseg_secondary_init(),
----eal_memalloc_init(),
----rte_eal_hugepage_attach(),
----rte_eal_memdevice_init().
 
1) memseg_secondary_init():
Create, directly from the contents of the config file, the same view of virtual address space as the primary process.
 
2) eal_memalloc_init():
Create a local copy (local_memsegs) of the struct rte_memseg_lists in mcfg, used to synchronize memory hotplug events with the primary process, and initialize the fd_list structures. As in the primary process, single-file-segments mode uses one file descriptor per rte_memseg_list (memseg_list_fd in fd_list), while file-per-page mode uses one file descriptor per mem segment of each rte_memseg_list (the fds array in fd_list).
 
3) rte_eal_hugepage_attach():
With legacy mem, eal_legacy_hugepage_attach() reads the hugepage_data file, builds the same memory image as the primary process based on its contents, and sets the corresponding file descriptors in fd_list. With dynamic mem, eal_hugepage_attach() calls eal_memalloc_sync_with_primary() to synchronize the primary process's mcfg->memsegs into this process's local_memsegs.
 
4) rte_eal_memdevice_init():
Does nothing in a secondary process.
 
5. rte_eal_malloc_heap_init()
Initialize mcfg->malloc_heaps, register the inter-process communication handles used for multi-process memory allocation, and initialize the heap structures.
 
III. Summary
1. If hugetlbfs is not used, ordinary system pages (4 KB) are used by default.
2. DPDK has two memory modes:
 
legacy mode: guarantees that contiguous mem segments which are contiguous in virtual address space are also contiguous in physical memory
dynamic mode: hugepages are allocated one at a time, so it cannot give the same guarantee as legacy mode
 
3. memalloc has two modes, single-file-segments and page-per-file. In both, the memory is backed by files on the hugetlbfs mount point (i.e. files that live in memory), so system calls that operate on file descriptors can be used to manage the memory during allocation.
 
4. Each socket has one heap, each heap covers several rte_memseg_lists, each rte_memseg_list contains several rte_memsegs, and one rte_memseg corresponds to one memory page.
 
5. During allocation, techniques such as read-ahead and pre-allocation reduce stalls caused by page faults and improve performance.
