|
| 1 | +commit 1ae2a07024618b042dc24fd5858ac2537f3d0d9d |
| 2 | +Author: Keno Fischer < [email protected]> |
| 3 | +Date: Sat Aug 19 01:40:18 2023 +0000 |
| 4 | + |
| 5 | + Use patchable rdtsc sequence to avoid slowdowns under rr |
| 6 | + |
| 7 | + We (Julia) ship both support for using tracy to trace julia applications, |
| 8 | + as well as using `rr` (https://github.com/rr-debugger/rr) for record-replay debugging. |
| 9 | + After our most recent rebuild of tracy, users have been reporting significant performance |
| 10 | + slowdowns when `rr` recording a session that happens to also load the tracy library |
| 11 | + (even if tracing is not enabled). Upon further examination, the recompile happened |
| 12 | + to trigger a protective heuristic that disabled rr's patching of tracy's use of |
| 13 | + `rdtsc` because an earlier part of the same function happened to look like a |
| 14 | + conditional branch into the patch region. See https://github.com/rr-debugger/rr/pull/3580 |
| 15 | + for details. To avoid this issue occurring again in future rebuilds of tracy, |
| 16 | + adjust tracy's `rdtsc` sequence to be `nopl; rdtsc`, which (as of the |
| 17 | + linked PR) is a sequence that is guaranteed to bypass this heuristic |
| 18 | + and not incur the additional overhead when run under rr. |
| 19 | + |
| 20 | +diff --git a/public/client/TracyProfiler.hpp b/public/client/TracyProfiler.hpp |
| 21 | +index 1b825ea3..eea8c32d 100644 |
| 22 | +--- a/public/client/TracyProfiler.hpp |
| 23 | ++++ b/public/client/TracyProfiler.hpp |
| 24 | +@@ -206,10 +206,24 @@ public: |
| 25 | + return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); |
| 26 | + } |
| 27 | + # elif defined __x86_64__ || defined _M_X64 |
| 28 | +- if( HardwareSupportsInvariantTSC() ) |
| 29 | ++ |
| 30 | ++#define NOP5_OVERRIDE_NOP |
| 31 | ++ uint64_t low, high; |
| 32 | ++ if( HardwareSupportsInvariantTSC() ) |
| 33 | + { |
| 34 | + uint64_t rax, rdx; |
| 35 | +- asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); |
| 36 | ++ // Some external tooling (such as rr) wants to patch our rdtsc and replace it by a |
| 37 | ++ // branch to control the external input seen by a program. This kind of patching is |
| 38 | ++ // not generally possible depending on the surrounding code and can lead to significant |
| 39 | ++ // slowdowns if the compiler generated unlucky code and rr and tracy are used together. |
| 40 | ++ // To avoid this, use the rr-safe `nopl 0(%rax, %rax, 1); rdtsc` instruction sequence, |
| 41 | ++ // which rr promises will be patchable independent of the surrounding code. |
| 42 | + asm volatile ( |
| 43 | + // This is nopl 0(%rax, %rax, 1), but assemblers are inconsistent about whether |
| 44 | + // they emit that as a 4 or 5 byte sequence and we need to be guaranteed to use |
| 45 | + // the 5 byte one, so spell out its encoding (0f 1f 44 00 00) by hand. |
| 46 | + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t" |
| 47 | + "rdtsc" : "=a" (rax), "=d" (rdx) ); |
| 48 | + return (int64_t)(( rdx << 32 ) + rax); |
| 49 | + } |
| 50 | + # else |
0 commit comments