Chapter 20. Tuning

Table of Contents

1. Memory
2. I/O
3. todo

1. Memory

1.1. Monitor the memory

watch -n 1 -d grep ^Commit /proc/meminfo
watch -n 1 -d free -m
ps -eo vsz,rss,pid,args | sed 1d | sort -n
sar -r
sar -S
vmstat 1

1.2. Allocate memory

Program overcommit:

/*
 *
 *   overcommit.c
 *
 *   Allocate memory given the size in argument.
 *
 *   Guillaume Kielwasser 2013/10/12
 *
 */

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define VERSION "0.1"

char *prog_name = NULL;

/*
 * Print the command-line synopsis on stderr, then terminate the process
 * with EXIT_FAILURE.  Control never goes back to the caller.
 */
int usage ()
{
  FILE *out = stderr;

  fprintf (out, "Usage: %s [-Vh] [-m] mem_Mb\n", prog_name);
  exit (EXIT_FAILURE);
}

/*
 * Convert the mem_Mb command-line argument into a byte count.
 *
 * text  - argument as typed by the user
 * bytes - receives the size in bytes on success
 *
 * Returns 0 on success, -1 when text is not a plain positive decimal
 * number, and -2 when it is a number whose size in bytes does not fit in
 * a size_t.
 */
static int parse_mem_mb (const char *text, size_t *bytes)
{
  const size_t mb = (size_t) 1024 * 1024;
  unsigned long val;
  char *end = NULL;

  /* strtoul() would accept leading blanks and a sign; insist on a digit */
  if (*text < '0' || *text > '9')
    return -1;

  errno = 0;
  val = strtoul (text, &end, 10);

  /* trailing garbage ("12abc") and zero are both rejected */
  if (*end != '\0' || val == 0)
    return -1;

  /* refuse values whose conversion to bytes would wrap around */
  if (errno == ERANGE || val > (size_t) -1 / mb)
    return -2;

  *bytes = (size_t) val * mb;
  return 0;
}

/*
 * overcommit [-Vh] [-m] mem_Mb
 *
 * Allocate mem_Mb megabytes with malloc() and then sleep forever so the
 * process can be observed from another terminal.
 *
 *   -V  print the version and exit successfully
 *   -h  print the usage message (usage() exits with EXIT_FAILURE)
 *   -m  write to every byte of the allocation after obtaining it
 *
 * Exits with EXIT_FAILURE on a bad command line or when malloc() fails.
 */
int main (int argc, char **argv)
{
  int opt;
  int V_FLAG = 0, H_FLAG = 0, M_FLAG = 0;
  char *ptr = NULL;
  size_t mem = 0;
  size_t i = 0;

  prog_name = argv[0];

  while ((opt = getopt (argc, argv, "Vhm")) != -1) {
    switch (opt) {
      case 'V':
        V_FLAG = 1;
        break;
      case 'h':
        H_FLAG = 1;
        break;
      case 'm':
        M_FLAG = 1;
        break;
      default:
        usage ();
    }
  }

  if (V_FLAG) {
    printf ("%s version %s\n", prog_name, VERSION);
    return (EXIT_SUCCESS);
  }

  if (H_FLAG)
    usage ();

  /* exactly one non-option argument is expected: the size in megabytes */
  if (argc != optind + 1)
    usage ();

  switch (parse_mem_mb (argv[optind], &mem)) {
    case 0:
      break;
    case -2:
      fprintf (stderr, "%s: %s is too large\n",
        prog_name, argv[optind]);
      exit (EXIT_FAILURE);
    default:
      fprintf (stderr, "%s: %s not a number\n",
        prog_name, argv[optind]);
      exit (EXIT_FAILURE);
  }

  if ((ptr = malloc (mem)) == NULL) {
    fprintf (stderr, "%s: malloc %zu failed (%s)\n",
      prog_name, mem, strerror (errno));
    exit (EXIT_FAILURE);
  }

  if (M_FLAG) {
    /*
     * The buffer is never read back, so plain stores could be discarded
     * by the optimizer.  Writing through a volatile lvalue keeps them,
     * and indexing leaves ptr pointing at the start of the allocation.
     */
    volatile char *touch = ptr;

    for (i = 0; i < mem; i++)
      touch[i] = 0;
  }

  /* hold on to the allocation until the process is killed */
  while (1) {
    sleep (1);
  }

  /* not reached */
  return (EXIT_SUCCESS);
}

Makefile:

# Compiler used to build the example program
CC = gcc

# -s     strip the symbol table from the resulting binary
# -Wall  enable the common set of warnings
# -O3    optimize aggressively
CFLAGS = -s -Wall -O3

# Build the overcommit binary from its single source file
overcommit: overcommit.c
        $(CC) $(CFLAGS) -o overcommit overcommit.c

1.3. Over commit

1.3.1. Heuristic overcommit

Set the overcommit_memory tunable to 0:

sysctl -w vm.overcommit_memory=0

Obvious overcommits of address space are refused. Allows overcommit to reduce swap usage. This feature can be very useful because there are a lot of programs that malloc() huge amounts of memory "just-in-case" and don't use much of it.

Example on a system with 2Gb of RAM and 2Gb of swap:

$ grep -E 'MemTotal:|SwapTotal:' /proc/meminfo
MemTotal:        1986160 kB
SwapTotal:       2030588 kB

asking for 3Gb of memory works:

$ ./overcommit 3000

but asking for 3.5Gb is not permitted (considered here as obvious overcommit):

$ ./overcommit 3500
./overcommit: malloc 3670016000 failed (Cannot allocate memory)

FYI: the CommitLimit value in /proc/meminfo plays no part in this heuristic "obvious overcommit" check; it is only enforced in "never overcommit" mode (vm.overcommit_memory=2).

1.3.1.1. Allocate and use the memory
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    1148268 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        821       1118          0         55        419
-/+ buffers/cache:        345       1593
Swap:         1982         16       1966
./overcommit -m 1000
$ ps -eo vsz,rss,pid,args | grep 'overcommi[t]'
1028160 1024404 6728 ./overcommit -m 1000
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    2170544 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939       1823        116          0         56        419
-/+ buffers/cache:       1347        592
Swap:         1982         16       1966
1.3.1.2. Allocate but don't use the memory
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    1145828 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        821       1117          0         56        419
-/+ buffers/cache:        345       1594
Swap:         1982         16       1966
./overcommit 1000
$ ps -eo vsz,rss,pid,args | grep 'overcommi[t]'
1028160  356  7380 ./overcommit 1000
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    2170132 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        821       1117          0         56        419
-/+ buffers/cache:        345       1594
Swap:         1982         16       1966
1.3.1.3. Allocate more than the physical memory
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    1143952 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        830       1109          0         57        420
-/+ buffers/cache:        352       1587
Swap:         1982         16       1966
./overcommit 2900 &
./overcommit 2900 &
./overcommit 2900 &
$ ps -eo vsz,rss,pid,args | grep 'overcommi[t]'
2973760  356  7941 ./overcommit 2900
2973760  356  7942 ./overcommit 2900
2973760  356  7943 ./overcommit 2900
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:   10053432 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        830       1109          0         57        420
-/+ buffers/cache:        352       1587
Swap:         1982         16       1966
$ sar -r
14:02:01    kbmemfree kbmemused  %memused kbbuffers  kbcached  kbcommit   %commit  kbactive   kbinact
14:03:01      1135508    850652     42,83     58984    430348   1145256     28,51    362912    364516
14:04:01      1134036    852124     42,90     59032    430412   1145320     28,51    364476    364584
14:05:01      1134460    851700     42,88     59108    430292   1145192     28,51    364272    364772
14:06:01      1135304    850856     42,84     59196    430292  10054664    250,32    363160    364884
14:07:01      1134660    851500     42,87     59256    430312  10054680    250,32    363236    364980
14:08:01      1137496    848664     42,73     59360    430296  10054600    250,32    360792    364796
Average:      1142628    843532     42,47     40850    300155   1811622     45,10    361707    352409
1.3.1.4. Out of memory killer
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        581       1358          0          4         68
-/+ buffers/cache:        509       1430
Swap:         1982          0       1982
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    1444952 kB
./overcommit -m 3000
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939       1868         71          0          4         74
-/+ buffers/cache:       1789        150
Swap:         1982       1768        214
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    4497040 kB
./overcommit -m 2000
$ ./overcommit -m 3000
Killed
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939       1867         71          0          0         47
-/+ buffers/cache:       1819        120
Swap:         1982        801       1181
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    3473256 kB
Oct 13 18:04:24 voyager kernel: [158113.978944] Out of memory: Kill process 31659 (overcommit) score 766 or sacrifice child
Oct 13 18:04:24 voyager kernel: [158113.978947] Killed process 31659 (overcommit) total-vm:3076160kB, anon-rss:1266632kB, file-rss:364kB

1.3.2. Always overcommit

Set the overcommit_memory tunable to 1.

sysctl -w vm.overcommit_memory=1

When this flag is 1, the kernel pretends there is always enough memory until it actually runs out.

$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    1331804 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        575       1363          0          3         83
-/+ buffers/cache:        489       1450
Swap:         1982          0       1982

Let's allocate 95Tb of memory:

./overcommit 100000000
$ ps -eo vsz,rss,pid,args | grep 'overcommi[t]'
102400004160 356 19884 ./overcommit 100000000
$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:   102401332116 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        576       1362          0          3         83
-/+ buffers/cache:        490       1449
Swap:         1982          0       1982
$ sar -r
21:00:02    kbmemfree kbmemused  %memused kbbuffers  kbcached  kbcommit   %commit  kbactive   kbinact
21:01:02      1558000    428160     21.56      1480     63636   1341020     62.27    136816    172588
21:02:01      1402080    584080     29.41      2520     83792   1338016     33.31    164544    301580
21:03:01      1396800    589360     29.67      2552     88672 615741392   15329.35    165416    305964
21:04:01      1398452    587708     29.59      3312     84956   1333036     33.19    166904    302732
21:05:01      1396364    589796     29.70      3328     85044   1333036     33.19    169356    302828
21:06:01      1391520    594640     29.94      3644     86600 102401333260   2549359.16    172152    304568
21:07:01      1386324    599836     30.20     10256    100260 102401331020   2549359.11    162880    319300
21:08:01      1382360    603800     30.40     10448    100420 102401331660   2549359.12    166580    319156
Average:       537358   1448802     72.94    127415    279140 555181698   13821.67    797146    501741

1.3.3. Don't overcommit

Set the overcommit_memory tunable to 2.

sysctl -w vm.overcommit_memory=2

When this flag is 2, the kernel uses a "never overcommit" policy that attempts to prevent any overcommit of memory. When overcommit_memory is set to 2, the total committed address space of the system is not permitted to exceed swap plus the vm.overcommit_ratio percentage of physical RAM, ie, the CommitLimit of /proc/meminfo.

$ grep ^Commit /proc/meminfo
CommitLimit:     3023668 kB
Committed_AS:    1330612 kB
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        594       1344          0         10        100
-/+ buffers/cache:        484       1455
Swap:         1982          0       1982
./overcommit 2000
./overcommit: malloc 2097152000 failed (Cannot allocate memory)

As about 1.3Gb of memory was already committed (Committed_AS), asking for 2Gb more is not permitted: the total would exceed the limit of about 2.9Gb (CommitLimit).

1.4. Buffers and cache

The buffers remember what's in directories, what file permissions are, and keep track of what memory is being written to or read from a particular block device. The cache only contains the contents of the files themselves.

1.4.1. Drop caches

Writing to this will cause the kernel to drop clean caches, dentries and
inodes from memory, causing that memory to become free.

To free pagecache:
        echo 1 > /proc/sys/vm/drop_caches
To free dentries and inodes:
        echo 2 > /proc/sys/vm/drop_caches
To free pagecache, dentries and inodes:
        echo 3 > /proc/sys/vm/drop_caches

As this is a non-destructive operation and dirty objects are not freeable, the
user should run `sync' first.
sync && echo 3 > /proc/sys/vm/drop_caches

1.4.2. Fill the buffers

1.4.2.1. With inode entries
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        559       1380          0          0         81
-/+ buffers/cache:        477       1462
Swap:         1982          0       1982
find /usr -exec ls -l {} \; > /dev/null 2>&1
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        846       1092          0        147         92
-/+ buffers/cache:        606       1333
Swap:         1982          0       1982
1.4.2.2. With raw device entries
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        552       1386          0          0         77
-/+ buffers/cache:        474       1465
Swap:         1982          0       1982
dd if=/dev/sda of=/dev/null
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939       1806        133          0       1219         77
-/+ buffers/cache:        509       1430
Swap:         1982          0       1982

1.4.3. Fill the cache

$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        557       1381          0          0         78
-/+ buffers/cache:        479       1460
Swap:         1982          0       1982
dd if=/dev/zero of=/tmp/dd.tmp bs=1024 count=1048576
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939       1619        319          0          2       1103
-/+ buffers/cache:        513       1425
Swap:         1982          0       1982

1.4.4. The cache effect on file reading

$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939        553       1386          0          0         78
-/+ buffers/cache:        474       1465
Swap:         1982          0       1982
$ dd if=/tmp/dd.tmp of=/dev/null
2097152+0 records in
2097152+0 records out
1073741824 bytes (1.1 GB) copied, 31.8524 s, 33.7 MB/s
$ free -m
             total       used       free     shared    buffers     cached
Mem:          1939       1591        348          0          1       1111
-/+ buffers/cache:        477       1461
Swap:         1982          0       1982
$ dd if=/tmp/dd.tmp of=/dev/null
2097152+0 records in
2097152+0 records out
1073741824 bytes (1.1 GB) copied, 1.61037 s, 667 MB/s

1.5. The difference among VIRT, RES, and SHR in top output

from http://www.linuxhowtos.org/System/Linux%20Memory%20Management.htm

VIRT stands for the virtual size of a process, which is the sum of memory it is actually using, memory it has mapped into itself (for instance the video card's RAM for the X server), files on disk that have been mapped into it (most notably shared libraries), and memory shared with other processes. VIRT represents how much memory the program is able to access at the present moment. RES stands for the resident size, which is an accurate representation of how much actual physical memory a process is consuming. (This also corresponds directly to the %MEM column.) This will virtually always be less than the VIRT size, since most programs depend on the C library. SHR indicates how much of the VIRT size is actually sharable (memory or libraries). In the case of libraries, it does not necessarily mean that the entire library is resident. For example, if a program only uses a few functions in a library, the whole library is mapped and will be counted in VIRT and SHR, but only the parts of the library file containing the functions being used will actually be loaded in and be counted under RES.

1.5.1. VmData

./overcommit 2000
$ ps -eo vsz,rss,pid,args | grep 'overcommi[t]'
2052160  356  6179 ./overcommit 2000
$ grep ^Vm /proc/6179/status
VmPeak:  2052160 kB
VmSize:  2052160 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:       356 kB
VmRSS:       356 kB
VmData:  2048048 kB
VmStk:       136 kB
VmExe:         4 kB
VmLib:      1884 kB
VmPTE:        32 kB
VmSwap:        0 kB
 VmPeak                      peak virtual memory size
 VmSize                      total program size
 VmLck                       locked memory size
 VmPin                       pinned memory size
 VmHWM                       peak resident set size ("high water mark")
 VmRSS                       resident set size (memory currently held in RAM)
 VmData                      size of private data segments
 VmStk                       size of stack segments
 VmExe                       size of text segment
 VmLib                       size of shared library code
 VmPTE                       size of page table entries
 VmSwap                      size of swap usage (the number of referred swapents)

1.5.2. VmRSS

./overcommit -m 1000
# ps -eo vsz,rss,pid,args | grep 'overcommi[t]'
1028160 1024404 6532 ./overcommit -m 1000
# grep ^Vm /proc/6532/status                   
VmPeak:  1028160 kB
VmSize:  1028160 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:   1024404 kB
VmRSS:   1024404 kB
VmData:  1024048 kB
VmStk:       136 kB
VmExe:         4 kB
VmLib:      1884 kB
VmPTE:      2028 kB
VmSwap:        0 kB

1.5.3. VmStk

stack.c:

#include <unistd.h>

#define SIZE 2147483648

/*
 * stack.c
 *
 * Declare a SIZE-byte (2 GiB) automatic array and then sleep forever so
 * the process can be inspected with ps and /proc/<pid>/status.
 */
int main (void)
{
  /* the array is declared but deliberately never read or written */
  char   a[SIZE];

  (void) a;

  while (1) {
    sleep (1);
  }
}
gcc -o stack stack.c
ulimit -s unlimited
./stack
$ ps -eo vsz,rss,pid,args | grep 'stac[k]$'
2101176  348 16403 ./stack
$ grep ^Vm /proc/16403/status
VmPeak:  2101176 kB
VmSize:  2101176 kB
VmLck:         0 kB
VmPin:         0 kB
VmHWM:       348 kB
VmRSS:       348 kB
VmData:       40 kB
VmStk:   2097160 kB
VmExe:         4 kB
VmLib:      1884 kB
VmPTE:        28 kB
VmSwap:        0 kB