Performance Engineering FAQ

In response to a few requests, I am posting an FAQ on PSR (Performance, Scalability, Reliability). I don't want to focus on basic questions such as "What is PSR?" or "Why PSR?"; the focus here is on technical questions.

1) While monitoring memory for a 64-bit process, do you track RSS or VM? What do you conclude if RSS is low but VM is high on Linux x64?

Ans) The VmLib or VmExe memory could be large. Run  $ cat /proc/pid/status  (with pid replaced by the process ID) and see which section is taking the most memory.

2) How do you get the memory breakdown of a Linux process?

Ans) cat /proc/pid/status. VmData gives the size of the heap (data) section, VmLib gives the size of the shared-library section, and VmExe gives the size of the executable in memory.

3) When you observe a process hang, how do you debug the issue?

Ans) Take a thread dump using pstack on Linux: run  $ pstack <pid> > output.txt . The file output.txt then contains the thread dump. See which lock all the threads are waiting on. If it is a deadlock, debug the locking issue, optimize the locks/algorithms, and re-run your application.

4) When the process CPU usage is high for your app, how do you find the bottlenecks in your app?

Ans) This could be due to a CPU bottleneck caused by your algorithms. Run valgrind --tool=callgrind for call-graph CPU profiling and identify the methods that consume the most CPU. Then optimize the methods/functions that are taking the most CPU.

5) How do you find the number of threads for a given process ID?

Ans) cat /proc/pid/status | grep Threads. This gives the number of threads used by your app.

6) If the kernel CPU usage is very high for an application, what does it indicate?

Ans) Your application could be making a lot of system calls. Launch strace to find out which system calls your app uses.
strace -c -o output.txt ./application . This writes to output.txt a summary of the system calls made by the ./application binary and the time spent in each of them.

7) What is the difference between 32-bit and 64-bit CPU architectures? Is a 64-bit process faster than a 32-bit one?

Ans) The advantage of a 64-bit machine shows when you need to access more than 4 GB of memory. 64-bit data models include LP64, LLP64 and ILP64, where L stands for long, I for integer and P for pointer. Most platforms use LP64, in which long and pointers occupy 8 bytes. A 32-bit process cannot go beyond 4 GB of memory because its memory addresses are only 32 bits wide. A 64-bit process is often faster than a 32-bit one when it comes to memory operations.


Get the call stack programmatically for a running process - Linux x64

I happened to read about the libunwind library from HP Labs, which has been ported to pretty much all Linux platforms. The libunwind library unwinds the call stack of a process. We need to use ptrace to attach to the process, which suspends it; we then create an address space for the remote process and use libunwind to unwind the stack.

Here I post code that works much like pstack on Linux x64 - for 64-bit processes only.



















#include <dirent.h>
#include <elf.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <libunwind.h>
#include <libunwind-ptrace.h>

#define false 0

/* Set to 1 by the -v command-line switch; in this file it is only echoed in debug output. */
static int verbose_option = 0;
/* Set to 1 by the -D command-line switch; enables the diagnostic printf tracing. */
static int debug_option = 0;

/* Maximum number of waitpid() polls attach_target() makes while waiting for the stop. */
static int wait_loops = 20;
/* Pause between those polls, in milliseconds (passed to msleep()). */
static int wait_time = 100;

/* NOTE(review): not referenced by the code in this file; presumably the target's pointer width in bytes. */
static int pointer_size = 8;

/* NOTE(review): not referenced by the code in this file; an address in the 64-bit target. */
typedef Elf64_Addr TARGET_ADDRESS;


/* Bookkeeping for the traced target, shared by main() and the attach/detach helpers. */
typedef struct _process_info {
int pid;                  /* target process id, stored by pi_alloc() */
int threads_present_flag; /* set to 1 by grok_get_threads() when more than one task was found */
long *thread_pids;        /* 0-terminated list of task ids, assigned by grok_get_threads() */
} process_info;


/*
 * Sleep for `msecs` milliseconds.
 * Thin wrapper around usleep(), which expects microseconds.
 */
static void msleep(int msecs)
{
    int usecs = msecs * 1000;

    usleep(usecs);
}

/*
 * Attach to process `thepid` with ptrace and wait until it has stopped.
 *
 * PTRACE_ATTACH makes the kernel send SIGSTOP to the target, so the target
 * may only be inspected after waitpid() has reported that stop.  The stop is
 * polled for at most `wait_loops` times, sleeping `wait_time` milliseconds
 * between polls.
 *
 * Returns 0 once the target is stopped, otherwise an errno-style code:
 * the errno of the failing ptrace()/waitpid() call, ESRCH if the target
 * terminated while we were waiting, or ETIMEDOUT if it never stopped.
 */
static int attach_target(int thepid)
{
    /* Initialised because waitpid(WNOHANG) leaves it untouched when it returns 0. */
    int waitstatus = 0;
    int x;

    if (debug_option) printf("Attaching to the target process...\n");
    if (ptrace(PTRACE_ATTACH, thepid, NULL, NULL) != 0) {
        /* Save errno immediately; later library calls may overwrite it. */
        int err = errno;
        return err != 0 ? err : EIO;
    }

    if (debug_option) printf("Waiting for target process to stop...\n");
    for (x = 0; x < wait_loops; x++) {
        pid_t ret = waitpid(thepid, &waitstatus, WUNTRACED | WNOHANG);

        if (debug_option) {
            printf("waitpid after attach returned: %d, status=%d\n", (int)ret, waitstatus);
        }
        if (ret == thepid) {
            /* Only trust the status when waitpid() actually reported our target. */
            if (WIFSTOPPED(waitstatus)) {
                if (debug_option) printf("Target process has stopped.\n");
                return 0;
            }
            if (WIFEXITED(waitstatus) || WIFSIGNALED(waitstatus)) {
                return ESRCH; /* target died while we were attaching */
            }
        } else if (ret < 0 && errno != EINTR) {
            int err = errno;
            ptrace(PTRACE_DETACH, thepid, NULL, NULL); /* best effort */
            return err;
        }
        msleep(wait_time); /* Sleep for a bit so we don't busy wait */
    }

    /* The target never reported the stop: give up instead of claiming success. */
    ptrace(PTRACE_DETACH, thepid, NULL, NULL); /* best effort */

    /* TODO: install handlers for SIGINT/SIGSEGV so we can detach if interrupted. */
    return ETIMEDOUT;
}

/*
 * Attach to a single thread (task id `threadpid`) and block until the
 * kernel reports it as stopped.
 *
 * Returns 0 when the thread is attached and stopped, otherwise an
 * errno-style code: the errno of the failing ptrace()/waitpid() call, or
 * ESRCH if the thread was reported in a state other than stopped.
 */
static int attach_thread(long threadpid)
{
    int waitstatus = 0;

    if (debug_option) printf("Attaching to the target thread %ld...\n", threadpid);
    if (ptrace(PTRACE_ATTACH, (pid_t)threadpid, NULL, NULL) != 0) {
        /* perror() may itself change errno, so capture it first. */
        int err = errno;
        errno = err;
        perror("ptrace(PTRACE_ATTACH)");
        return err != 0 ? err : EIO;
    }

    for (;;) {
        /* __WCLONE: threads are clone children and are not reported by a plain waitpid(). */
        pid_t ret = waitpid((pid_t)threadpid, &waitstatus, __WCLONE);

        if (ret > 0) {
            return WIFSTOPPED(waitstatus) ? 0 : ESRCH;
        }
        /* Retry only when interrupted by a signal; any other failure would repeat forever. */
        if (ret < 0 && errno != EINTR) {
            int err = errno;
            perror("waitpid");
            ptrace(PTRACE_DETACH, (pid_t)threadpid, NULL, NULL); /* best effort */
            return err;
        }
    }
}

/*
 * Detach from every task recorded in `pi` and finally from the main process,
 * letting the target run again.
 *
 * PTRACE_DETACH is used (rather than PTRACE_CONT) because it both resumes the
 * stopped task and ends the trace relationship.  Entries of pi->thread_pids
 * equal to pi->pid are skipped in the loop since the main task is detached
 * last; the list is not assumed to have the main task at index 0.
 *
 * Individual thread failures (e.g. ESRCH for a task that is already detached
 * or gone) are only reported in debug output.
 *
 * Returns the result of the ptrace() call on the main process (0 on success,
 * -1 on failure), or -1 if `pi` is NULL.
 */
static int detatch_target(process_info *pi)
{
    int ret;

    if (pi == NULL) {
        return -1;
    }

    if (pi->threads_present_flag && pi->thread_pids != NULL) {
        int x;

        if (debug_option) printf("Detatching from threads...\n");
        for (x = 0; pi->thread_pids[x]; x++) {
            long thread_pid = pi->thread_pids[x];

            if (thread_pid == pi->pid) {
                continue; /* main task is handled below */
            }
            if (debug_option) printf("Detatching from thread %ld\n", thread_pid);
            ret = (int)ptrace(PTRACE_DETACH, (pid_t)thread_pid, NULL, NULL);
            if (debug_option) printf("ptrace(PTRACE_DETACH) returned: %d\n", ret);
        }
    }

    if (debug_option) printf("Detaching from target...\n");
    ret = (int)ptrace(PTRACE_DETACH, pi->pid, NULL, NULL);
    if (debug_option) printf("ptrace(PTRACE_DETACH) returned: %d\n", ret);
    return ret;
}

/*
 * Allocate a zero-filled process_info and record `pid` in it.
 * Returns NULL when the allocation fails; release with pi_free().
 */
process_info *pi_alloc(int pid)
{
    process_info *info = calloc(1, sizeof *info);

    if (info == NULL) {
        return NULL;
    }
    info->pid = pid;
    return info;
}

/*
 * Release a process_info obtained from pi_alloc().
 * Only the structure itself is freed; the thread_pids pointer is not touched.
 */
void pi_free(process_info *pi)
{
    /* free(NULL) is a no-op, so no NULL guard is required. */
    free(pi);
}


/*
 * Defined further down in this file.  Declared here so the call below is
 * checked against a prototype instead of being an implicit declaration,
 * which C99 and later no longer allow.
 */
int unwind_thread_callstack(int thetid);

/*
 * Print the call stack of task `thepid` by delegating to
 * unwind_thread_callstack().  `pi` is accepted for interface compatibility
 * but is not used.
 *
 * Returns whatever unwind_thread_callstack() returns.
 */
int grok_and_print_thread_stack(process_info *pi, int thepid)
{
    (void)pi;
    return unwind_thread_callstack(thepid);
}

int unwind_thread_callstack(int thetid)
{
unw_cursor_t c;
unw_word_t ip;
unw_addr_space_t as;
struct UPT_info *ui;
char buf[512];
int ret;
pid_t pid;

as = unw_create_addr_space(&_UPT_accessors,0);
ui = _UPT_create(thetid);
unw_init_remote(&c,as,ui);

do {
unw_get_proc_name(&c,buf,sizeof(buf),NULL);
printf("%s\n",buf);
}
while((ret = unw_step(&c)) > 0);

_UPT_destroy(ui);
ptrace(PTRACE_DETACH,thetid,0,0);

}




int grok_get_threads(process_info *pi)
{
  int threads_present=0; int loop=0,numofth=0,tid=0;
char pids[16];
long thread_pids[10000];//handles 10,000 threads . TO DO have a linked list here instead of array
struct dirent *tids;
DIR *dp;
char *format_string = "/proc/%d/task";
        char *tasksdir = calloc(strlen(format_string) + 10 ,1);
sprintf(tasksdir,format_string,pi->pid);

dp=opendir(tasksdir);
if(dp==NULL){
perror("cant open /proc/pid/task dir");
}
while (tids=readdir(dp))
{
if(strcmp(tids->d_name, " ")!=0 && strlen(tids->d_name) > 0 )
     {
if(debug_option)
printf("\n loop i s%d",loop);
strcpy(pids,tids->d_name);
pids[strlen(tids->d_name)]='\0';
if(strcmp(pids,".") !=0 && strcmp(pids,"..") !=0 && strlen(pids) > 1 ){
  thread_pids[loop]=atol(pids);
  if(thread_pids[loop] != pi->pid )
  attach_thread(thread_pids[loop]);
  if(debug_option)
  printf("\n tid is %ld", thread_pids[loop]);
  loop++;
         }
     }
}
thread_pids[loop]='\0';
closedir(dp);

if(loop > 1){
pi->threads_present_flag = 1;
}

pi->thread_pids=thread_pids ;
for(tid=0;(pi->thread_pids)[tid];tid++){
printf("\n----------tid%ld----------\n",pi->thread_pids[tid]);
unwind_thread_callstack(pi->thread_pids[tid]);
}
}


/*
 * Report an unrecoverable error on stderr and terminate the program.
 * Exits with a non-zero status so that callers and scripts can detect the
 * failure.
 */
static void fatal(const char *s)
{
    fprintf(stderr, "vstack: fatal error: %s\n", s);
    exit(1);
}

/*
 * Print the command-line synopsis to stderr and exit with status 1.
 * -v turns on the verbose flag, -D turns on debug tracing, and the final
 * argument is the id of the process to inspect.
 */
static void usage(void)
{
    fprintf(stderr, "usage: vstack [-v] [-D] <pid>\n");
    exit(1);
}

/*
 * Entry point: vstack [-v] [-D] <pid>
 *
 * Parses the options, attaches to the target process, prints the call stack
 * of each of its threads (or of the process itself when no threads were
 * found), then detaches so the target resumes.
 *
 * Exits with status 1 on a usage error, when the target cannot be attached,
 * or when memory runs out; returns 0 otherwise.
 */
int main(int argc, char** argv)
{
    int pid = 0;
    int ret = 0;
    process_info *pi = NULL;
    int option_position = 1;
    char *end = NULL;
    long parsed;

    /* look for command line options; the last argument is always the pid */
    while (option_position < (argc - 1) && argv[option_position][0] == '-') {
        switch (argv[option_position][1]) {
        case 'v':
            verbose_option = 1;
            break;
        case 'D':
            debug_option = 1;
            break;
        default:
            usage();
            break;
        }
        option_position++;
    }
    if (option_position != (argc - 1)) {
        usage();
    }

    /* strtol instead of atoi: rejects trailing garbage and out-of-range values. */
    errno = 0;
    parsed = strtol(argv[option_position], &end, 10);
    if (end == argv[option_position] || *end != '\0' || errno != 0 ||
        parsed <= 0 || parsed > INT_MAX) {
        usage();
    }
    pid = (int)parsed;

    if (debug_option) {
        printf("verbose option: %s\n", verbose_option ? "on" : "off");
        printf("pid: %d\n", pid);
    }

    /* check that the pesky user hasn't tried to vstack himself */
    if (pid == getpid()) {
        fprintf(stderr, "Error: specified pid belongs to the vstack process\n");
        exit(1);
    }

    /* See if we can attach to the target */
    ret = attach_target(pid);
    if (ret) {
        fprintf(stderr, "Failed to attach to the target process: %s\n", strerror(ret));
        exit(1);
    }

    if (debug_option) printf("Attached to target process\n");

    pi = pi_alloc(pid);
    if (pi == NULL) {
        fprintf(stderr, "vstack: out of memory\n");
        /* Do not leave the target stopped behind us. */
        ptrace(PTRACE_DETACH, pid, NULL, NULL);
        exit(1);
    }

    /* Get the tids from /proc/<pid>/task on Linux; it has one entry per thread id. */
    ret = grok_get_threads(pi);

    if (!(pi->threads_present_flag)) {
        ret = grok_and_print_thread_stack(pi, pi->pid);
    }

    /* Detach so the target continues; otherwise it would stay suspended. */
    detatch_target(pi);

    pi_free(pi);

    if (debug_option) printf("Detatched from target process\n");

    return 0;
}

Compile with:  $ gcc vstack.c -m64 -lunwind-x86_64 -lunwind-ptrace -o vstack