Another way to resolve true sharing is thread-local storage (TLS). With TLS, each thread in a multithreaded program gets its own memory for data that is private to that thread, so threads no longer contend for a shared global variable. In C++, the thread_local keyword marks a variable as thread-private. In Rust, the thread_local! macro declares a thread-local variable whose value is accessed inside each thread through its with method; alternatively, a variable can be marked with the #[thread_local] attribute, or the third-party thread_local crate can be used. A concrete example follows below:
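Since the rest of this section's code is C, the sketch below illustrates the idea with C11's _Thread_local (GCC also accepts __thread), the C counterpart of the C++ thread_local keyword mentioned above; Rust's thread_local! with its with method, the #[thread_local] attribute, and the thread_local crate express the same pattern. The worker and counter names here are only illustrative:

#include <pthread.h>
#include <stdio.h>

/* Each thread gets its own copy of this counter, so incrementing it
 * never contends with other threads: no shared cache line, no lock. */
static _Thread_local long local_hits = 0;

/* Shared total, updated once per thread at the end instead of on
 * every iteration. */
static long total_hits = 0;
static pthread_mutex_t total_lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
    (void)arg;
    for (long i = 0; i < 1000000; i++)
        local_hits++;            /* thread-private, no sharing */

    pthread_mutex_lock(&total_lock);
    total_hits += local_hits;    /* single synchronized merge */
    pthread_mutex_unlock(&total_lock);
    return NULL;
}

int main(void)
{
    pthread_t tid[4];
    for (int i = 0; i < 4; i++)
        pthread_create(&tid[i], NULL, worker, NULL);
    for (int i = 0; i < 4; i++)
        pthread_join(tid[i], NULL);
    printf("total_hits = %ld\n", total_hits);
    return 0;
}

The next listing is the false-sharing demo program that the perf c2c analysis later in this section is run against.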
#define _GNU_SOURCE          /* for sched_getcpu() */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <pthread.h>
#include <numa.h>

/* Per-thread iteration count (assumed value). */
#define LOOP_CNT (5 * 1024 * 1024)

#if defined(__aarch64__)
static inline uint64_t rdtsc(void)
{
    uint64_t val;

    /*
     * According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
     * system counter is at least 56 bits wide; from Armv8.6, the counter
     * must be 64 bits wide. So the system counter could be less than 64
     * bits wide and it is attributed with the flag 'cap_user_time_short'
     * is true.
     */
    asm volatile("mrs %0, cntvct_el0" : "=r" (val));

    return val;
}
#endif
/*
 * Create a struct where reader fields share a cacheline with the hot lock field.
 * Compiling with -DNO_FALSE_SHARING inserts padding to avoid that sharing.
 */
typedef struct _buf {
    long lock0;
    long lock1;
    long reserved1;
#if defined(NO_FALSE_SHARING)
    long pad[5];    // to keep the 'lock*' fields on their own cacheline.
#else
    long pad[1];    // to provoke false sharing.
#endif
    long reader1;
    long reader2;
    long reader3;
    long reader4;
} buf __attribute__((aligned (64)));
buf buf1;
buf buf2;
volatile int wait_to_begin = 1;
struct thread_data *thread;
int max_node_num;
int num_threads;
char *lock_thd_name = "lock_th";
char *reader_thd_name = "reader_thd";
#define checkResults(string, val) {                      \
    if (val) {                                           \
        printf("Failed with %d at %s", val, string);     \
        exit(1);                                         \
    }                                                    \
}
struct thread_data {
    pthread_t tid;
    long tix;
    long node;
    char *name;
};
/*
 * Bind a thread to the specified numa node.
 */
void setAffinity(void *parm)
{
    volatile uint64_t rc, j;
    int node = ((struct thread_data *)parm)->node;
    char *func_name = ((struct thread_data *)parm)->name;

    /* Assumption: binding the calling thread to its node with
     * numa_run_on_node() is the minimal completion of this function. */
    rc = numa_run_on_node(node);
    if (rc != 0)
        printf("%s: numa_run_on_node(%d) failed\n", func_name, node);
}
/*
 * Thread function to simulate the false sharing.
 * The "lock" threads will test-n-set the lock field,
 * while the reader threads will just read the other fields
 * in the struct.
 */
extern void *read_write_func(void *parm)
{
    /* Per-thread locals, reconstructed from their use below. */
    uint64_t start, stop, j;
    int tix = ((struct thread_data *)parm)->tix;
    char *thd_name = ((struct thread_data *)parm)->name;
    // Pin each thread to a numa node.
    setAffinity(parm);

    // Wait for all threads to get created before starting.
    while (wait_to_begin)
        ;

    start = rdtsc();

    for (j = 0; j < LOOP_CNT; j++) {
        // Check for lock thread.
        if (*thd_name == *lock_thd_name) {
            __sync_lock_test_and_set(&buf1.lock0, 1);
            buf1.lock0 += 1;
            buf2.lock1 = 1;

        } else {
            // Reader threads.
            volatile long var;

            switch (tix % max_node_num) {
            case 0:
                var = *(volatile uint64_t *)&buf1.reader1;
                var = *(volatile uint64_t *)&buf2.reader1;
                break;
            case 1:
                var = *(volatile uint64_t *)&buf1.reader2;
                var = *(volatile uint64_t *)&buf2.reader2;
                break;
            case 2:
                var = *(volatile uint64_t *)&buf1.reader3;
                var = *(volatile uint64_t *)&buf2.reader3;
                break;
            case 3:
                var = *(volatile uint64_t *)&buf1.reader4;
                var = *(volatile uint64_t *)&buf2.reader4;
                break;
            }
        }
    } // End of for LOOP_CNT loop
    // Print out stats
    //
    stop = rdtsc();
    int cpu = sched_getcpu();
    int node = numa_node_of_cpu(cpu);
    printf("%ld mticks, %s (thread %d), on node %d (cpu %d).\n",
           (stop - start) / 1000000, thd_name, tix, node, cpu);

    return NULL;
}
int main(int argc, char *argv[])
{
    int i, n, rc = 0;

    if (argc != 2) {    /* argc should be 2 for correct execution */
        printf("usage: %s <n>\n", argv[0]);
        printf("where \"n\" is the number of threads per node\n");
        exit(1);
    }

    if (numa_available() < 0) {
        printf("NUMA not available\n");
        exit(1);
    }
int thread_cnt = atoi(argv[1]);
    max_node_num = numa_max_node();
    if (max_node_num == 0)
        max_node_num = 1;
    int node_cnt = max_node_num + 1;
    // Use "thread_cnt" threads per node.
    num_threads = (max_node_num + 1) * thread_cnt;

    /* Assumed allocation of the thread_data array (not shown above). */
    thread = calloc(num_threads, sizeof(struct thread_data));
    if (thread == NULL)
        exit(1);
    // Create the first half of threads as lock threads.
    // Assign each thread a successive round robin node to
    // be pinned to (later after it gets created.)
    //
    for (i = 0; i <= (num_threads / 2 - 1); i++) {
        thread[i].tix = i;
        thread[i].node = i % node_cnt;
        thread[i].name = lock_thd_name;
        rc = pthread_create(&thread[i].tid, NULL, read_write_func, &thread[i]);
        checkResults("pthread_create()\n", rc);
        usleep(500);
    }

    // Create the second half of threads as reader threads.
    // Assign each thread a successive round robin node to
    // be pinned to (later after it gets created.)
    //
    for (i = (num_threads / 2); i < num_threads; i++) {
        thread[i].tix = i;
        thread[i].node = i % node_cnt;
        thread[i].name = reader_thd_name;
        rc = pthread_create(&thread[i].tid, NULL, read_write_func, &thread[i]);
        checkResults("pthread_create()\n", rc);
        usleep(500);
    }
    // Sync to let threads start together
    usleep(500);
    wait_to_begin = 0;
    for (i = 0; i < num_threads; i++) {
        rc = pthread_join(thread[i].tid, NULL);
        checkResults("pthread_join()\n", rc);
    }

    return 0;
}
The main function: it first checks whether NUMA is available, then parses the command-line argument to determine how many threads to create per NUMA node. It then fills in a thread_data structure for each thread, holding the thread's metadata such as its thread ID, the NUMA node it belongs to, and its name. Finally it creates the threads, lock threads first and reader threads second, each started with a pthread_create call.
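Before turning to the perf c2c output, it helps to check concretely what the NO_FALSE_SHARING padding changes in the struct layout. The standalone sketch below (not part of the demo above) reuses the buf definition and prints the field offsets with offsetof: without the extra padding, reader1 sits at offset 32 and shares the 64-byte cache line holding lock0 and lock1; with -DNO_FALSE_SHARING, lock0, lock1, reserved1, and pad[5] fill exactly 64 bytes, so reader1 starts on the next cache line.

#include <stdio.h>
#include <stddef.h>

/* Same field layout as the buf struct in the listing above; build this
 * check with and without -DNO_FALSE_SHARING to compare the offsets. */
typedef struct _buf {
    long lock0;
    long lock1;
    long reserved1;
#if defined(NO_FALSE_SHARING)
    long pad[5];
#else
    long pad[1];
#endif
    long reader1;
    long reader2;
    long reader3;
    long reader4;
} buf __attribute__((aligned (64)));

int main(void)
{
    printf("offsetof(buf, lock0)   = %zu\n", offsetof(buf, lock0));
    printf("offsetof(buf, reader1) = %zu\n", offsetof(buf, reader1));
    printf("sizeof(buf)            = %zu\n", sizeof(buf));
    /* Expected: reader1 at offset 32 by default (same cache line as the
     * lock fields), or at offset 64 with -DNO_FALSE_SHARING. */
    return 0;
}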
Shared Data Cache Line Table: each row represents one cache line, listing its memory address, the NUMA node it resides on, the page-access count (PA cnt), the HITM counts, and other hit and miss statistics. The cache-line addresses here each show more than 50,000 total accesses (Total records), along with their HITM counts on the local node (LclHitm) and on remote nodes (RmtHitm), which reveals how the data is being shared across nodes.
Shared Cache Line Distribution Pareto: shows how the HITMs are distributed across cache lines. It reports the percentages of remote HITMs (RmtHitm) and local HITMs (LclHitm), an important measure of how efficiently memory is accessed across NUMA nodes. For example, for the first cache line (#0), roughly 30% of the accesses are local HITMs and a similar share are remote HITMs, meaning a substantial fraction of the memory accesses has to cross NUMA nodes, which usually implies higher latency.
Code address, cycles, and symbols: lists code addresses together with their CPU cycle counts, which helps identify which functions touch the contended cache lines and estimate how much they contribute. For example, different code addresses show different local and remote HITM counts (lcl hitm and rmt hitm), making it possible to pinpoint the code paths responsible for unnecessary remote memory accesses.