summaryrefslogtreecommitdiff
path: root/cil/ocamlutil/perfcount.c.in
blob: ae532f697165c2c986436cbe56f9ac9a479b32ce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
// -*- Mode: c -*-
//
/*
 * A module that allows the reading of performance counters on Pentium.
 *
 * This file contains both code that uses the performance counters to 
 * compute the number of cycles per second (to be used during ./configure) 
 * and also code to read the performance counters from Ocaml.
 *
 * Author: George Necula (necula@cs.berkeley.edu)
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(__GNUC__)
  #define longlong long long
  // RDTSC puts the result in EAX and EDX. We tell gcc to use those registers
  // for "low" and "high"
  #if defined(__i386__)
    #define GETCOUNTER(low,high) \
            __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high));
  #else
    #define GETCOUNTER(low,high) \
            printf ("Reading of performance counters is supported only on Intel x86\n"); \
            exit(1);
  #endif
#else
  // Microsoft Visual Studio
  #define longlong __int64
  #define inline   __inline
  #define GETCOUNTER(low,high) __asm { \
    __asm rdtsc \
    __asm mov low, eax \
    __asm mov high, edx };
#endif

/* Store here the first value read from the performance counter */
unsigned static longlong first_value;


/* This is the function that actually reads the performance counter. */
inline unsigned longlong read_ppc(void) {
  unsigned long pclow, pchigh;
  unsigned longlong lowhigh;

  GETCOUNTER(pclow, pchigh);
  
  // printf ("Read low=0x%08lx high=0x%08lx\n", low, high);

  // Put the 64-bit value together
  lowhigh = ((unsigned longlong)pclow) | ((unsigned longlong)pchigh << 32);

  if(first_value == 0) {
    first_value = lowhigh;
  }
  return lowhigh - first_value;
}


/* sm: I want a version that is as fast as possible, dropping
 * bits that aren't very important to achieve it. *
 *
 * This version drops the low 20 bits and the high 14 bits so the
 * result is 30 bits (always a positive Ocaml int); this yields
 * megacycles, which for GHz machines will be something like
 * milliseconds. */
static unsigned long sample_ppc_20(void)
{
  unsigned long pclow, pchigh;

  GETCOUNTER(pclow, pchigh);

  return ((pclow >> 20) | (pchigh << 12)) & 0x3FFFFFFF;
}

/* This version drops the low 10 bits, yielding something like
 * microseconds. */
inline static unsigned long sample_ppc_10()
{
  unsigned long pclow, pchigh;

  GETCOUNTER(pclow,pchigh);
    
  return ((pclow >> 10) | (pchigh << 22)) & 0x3FFFFFFF;
}



#ifndef CONFIGURATION_ONLY
/*** This is the OCAML stub for the read_ppc ***/
#include <caml/mlvalues.h>
#include <caml/alloc.h>
#include <caml/memory.h>

#define CYCLES_PER_USEC @CYCLES_PER_USEC@
value read_pentium_perfcount()
{
  double counter = (double)read_ppc() / (1000000.0 * CYCLES_PER_USEC);
  return copy_double(counter);
}

/* The Ocaml system can use this function to figure out if there are
 * performance counters available */
value has_performance_counters() {
  // HAS_PERFCOUNT is set by the configuration code at the end of
  // this file, during ./configure
#if @HAS_PERFCOUNT@ != 0
  return Val_true;
#else
  return Val_false;
#endif
}

/* sm: interface to above from Ocaml */
value sample_pentium_perfcount_20()
{
  return Val_long(sample_ppc_20());
}

value sample_pentium_perfcount_10()
{
  return Val_long(sample_ppc_10());
}

#endif


/* Now we have a function that tries to compute the number of cycles per 
 * second (to be used during ./configure) */
#ifdef CONFIGURATION_ONLY
#include <sys/times.h>
#include <unistd.h>
#include <math.h>

int main() {
  struct tms t;
  clock_t start, finish, diff;
  unsigned longlong start_pc, finish_pc, diff_pc;
  long clk_per_sec = sysconf(_SC_CLK_TCK);
  double cycles_per_usec;
  
  if(clk_per_sec <= 0) {
    printf("Cannot find clk_per_sec (got %ld)\n", clk_per_sec);
    exit(1);
  }
  
  times(&t); start = t.tms_utime;
  start_pc = read_ppc();
  // Do something for a while
  {
    int i;
    double a = 5.678;
    for(i=0;i<10000000;i++) {
      a = (i & 1) ? (a * a) : (sqrt(a));
    }
  }
  times(&t); finish = t.tms_utime;
  finish_pc = read_ppc();
  diff = finish - start;
  diff_pc = finish_pc - start_pc;
  if(diff == 0) {
    printf("Cannot use Unix.times\n");
    exit(1);
  }
  if(diff_pc == 0) {
    printf("Invalid result from the peformance counters\n");
    exit(1);
  }
  diff_pc /= 1000000; // We care about cycles per microsecond
//  printf("diff = %ld, diff_pc = %ld, clk = %ld\n",
//         (long)diff,
//         (long)diff_pc, (long)clk_per_sec);

  cycles_per_usec = (((double)diff_pc / (double)diff)
                               * (double)clk_per_sec);

  /* Whatever value we print here will be used as the CYCLES_PER_USEC
   * below */
  printf("%.3lf\n", cycles_per_usec);
  exit(0);
}
#endif //defined CONFIGURATION_ONLY