cpython/Python/perf_jit_trampoline.c at main · python/cpython · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
/*
 * Python Perf Trampoline Support - JIT Dump Implementation
 *
 * This file implements the perf jitdump API for Python's performance profiling
 * integration. It allows perf (Linux performance analysis tool) to understand
 * and profile dynamically generated Python bytecode by creating JIT dump files
 * that perf can inject into its analysis.
 *
 *
 * IMPORTANT: This file exports specific callback functions that are part of
 * Python's internal API. Do not modify the function signatures or behavior
 * of exported functions without coordinating with the Python core team.
 *
 * Usually the binary and libraries are mapped in separate region like below:
 *
 *   address ->
 *    --+---------------------+--//--+---------------------+--
 *      | .text | .data | ... |      | .text | .data | ... |
 *    --+---------------------+--//--+---------------------+--
 *          myprog                      libc.so
 *
 * So it'd be easy and straight-forward to find a mapped binary or library from an
 * address.
 *
 * For JIT code, however, the code arena only cares about the code section, while
 * the resulting DSOs (which are generated by perf inject -j) also contain ELF
 * headers and unwind info. perf would then generate the following address space
 * with synthesized MMAP events. Let's say it has a sample between address B and C.
 *
 *                                                sample
 *                                                  |
 *   address ->                         A       B   v   C
 *   ---------------------------------------------------------------------------------------------------
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
 *     ...
 *   ---------------------------------------------------------------------------------------------------
 *
 * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
 * the unwind info. If it maps both .text section and unwind sections, the sample
 * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
 * which one is right. So to make perf happy we have non-overlapping ranges for each
 * DSO:
 *
 *   address ->
 *   -------------------------------------------------------------------------------------------------------
 *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
 *   /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
 *     ...
 *   -------------------------------------------------------------------------------------------------------
 *
 * As the trampolines are constant, we add a constant padding; in general the padding needs to be the
 * size of the unwind info rounded up to 16 bytes. For our trampolines this is typically 0x50.
 */


#include "Python.h"
#include "pycore_ceval.h"         // _PyPerf_Callbacks
#include "pycore_frame.h"
#include "pycore_interp.h"
#include "pycore_mmap.h"          // _PyAnnotateMemoryMap()
#include "pycore_jit_unwind.h"
#include "pycore_runtime.h"       // _PyRuntime

#ifdef PY_HAVE_PERF_TRAMPOLINE

/* Standard library includes for perf jitdump implementation */
#if defined(__linux__)
#  include <elf.h>                // ELF architecture constants
#endif
#include <fcntl.h>                // File control operations
#include <stdio.h>                // Standard I/O operations
#include <stdlib.h>               // Standard library functions
#include <string.h>               // memcpy, strlen
#include <sys/mman.h>             // Memory mapping functions (mmap)
#include <sys/types.h>            // System data types
#include <unistd.h>               // System calls (sysconf, getpid)
#include <sys/time.h>             // Time functions (gettimeofday)
#if defined(__linux__)
#  include <sys/syscall.h>        // System call interface
#endif

// =============================================================================
//                           CONSTANTS AND CONFIGURATION
// =============================================================================

/*
 * Memory layout considerations for perf jitdump:
 *
 * Perf expects non-overlapping memory regions for each JIT-compiled function.
 * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
 * Shared Object) files that contain:
 * - ELF headers
 * - .text section (actual machine code)
 * - Unwind information (for stack traces)
 *
 * To ensure proper address space layout, we add padding between code regions.
 * This prevents address conflicts when perf maps the synthesized DSOs.
 *
 * Memory layout example:
 * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
 * /tmp/jitted-PID-1.so:                                       [headers][.text][unwind_info][padding]
 *
 * The padding size is now calculated automatically during initialization
 * based on the actual unwind information requirements.
 */


/*
 * ELF machine architecture constants returned by GetElfMachineArchitecture().
 *
 * On Linux they are provided by <elf.h>, which this file includes only when
 * __linux__ is defined. On every other platform, define just the constant
 * that matches the architecture being compiled for.
 */
#if !defined(__linux__)
#  if defined(__i386__) || defined(_M_IX86)
#    define EM_386      3
#  elif defined(__arm__) || defined(_M_ARM)
#    define EM_ARM      40
#  elif defined(__x86_64__) || defined(_M_X64)
#    define EM_X86_64   62
#  elif defined(__aarch64__)
#    define EM_AARCH64  183
#  elif defined(__riscv)
#    define EM_RISCV    243
#  endif
#endif

/* Convenient access to the global trampoline API state stored in _PyRuntime */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Type aliases */
typedef uint64_t uword;                    // 64-bit unsigned integer; holds code addresses and sizes below
typedef const char* CodeComments;          // Code comment strings (NOTE(review): no use is visible in this file - confirm it is still needed)

/* Memory size constants */
#define MB (1024 * 1024)                   // Bytes in one mebibyte; used to size the stdio buffer of the jitdump file

// =============================================================================
//                        ARCHITECTURE-SPECIFIC DEFINITIONS
// =============================================================================

/*
 * Map the compile-time target architecture to its ELF machine constant.
 *
 * The result is stored in the jitdump header so that the consumer knows
 * which architecture the recorded machine code was generated for.
 *
 * Returns: EM_X86_64, EM_386, EM_AARCH64, EM_ARM or EM_RISCV, depending on
 * the architecture this file is compiled for.
 */
static uint64_t GetElfMachineArchitecture(void) {
    uint64_t machine = 0;
#if defined(__x86_64__) || defined(_M_X64)
    machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    machine = EM_386;
#elif defined(__aarch64__)
    machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    machine = EM_ARM;
#elif defined(__riscv)
    machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // No ELF machine constant is known for this target
#endif
    return machine;
}

// =============================================================================
//                           PERF JITDUMP DATA STRUCTURES
// =============================================================================

/*
 * Perf jitdump file format structures
 *
 * These structures define the binary format that perf expects for JIT dump files.
 * The format is documented in the Linux perf tools source code and must match
 * exactly for proper perf integration.
 */

/*
 * Jitdump file header - written once at the beginning of each jitdump file
 * by perf_map_jit_write_header(). Contains metadata about the process and
 * the jitdump format version.
 *
 * The structure is written to the file as raw bytes, so the field order and
 * widths are part of the file format and must not be changed.
 */
typedef struct {
    uint32_t magic;              // Magic number (0x4A695444 = "JiTD")
    uint32_t version;            // Jitdump format version (written as 1)
    uint32_t size;               // Size of this header structure (sizeof(Header))
    uint32_t elf_mach_target;    // Target architecture (from GetElfMachineArchitecture)
    uint32_t reserved;           // Reserved field (written as 0)
    uint32_t process_id;         // Process ID passed in by the caller (getpid() in perf_map_jit_init)
    uint64_t time_stamp;         // Wall-clock creation time in microseconds (get_current_time_microseconds)
    uint64_t flags;              // Feature flags (written as 0)
} Header;

/*
 * Perf event types supported by the jitdump format
 * Each event type has a corresponding structure format. The numeric values
 * are written to the file in BaseEvent.event, so they must stay fixed.
 *
 * The writer in this file emits only PerfUnwindingInfo and PerfLoad records.
 */
enum PerfEvent {
    PerfLoad = 0,           // Code load event (new JIT function); see CodeLoadEvent
    PerfMove = 1,           // Code move event (function relocated)
    PerfDebugInfo = 2,      // Debug information event
    PerfClose = 3,          // JIT session close event
    PerfUnwindingInfo = 4   // Stack unwinding information event; see CodeUnwindingInfoEvent
};

/*
 * Base event structure - common header for all perf events
 * Every event in the jitdump file starts with this structure
 */
struct BaseEvent {
    uint32_t event;         // Event type (from PerfEvent enum)
    uint32_t size;          // Total size of this record in bytes: event struct plus trailing payload
    uint64_t time_stamp;    // Event time in nanoseconds from get_current_monotonic_ticks()
};

/*
 * Code load event - indicates a new JIT-compiled function is available
 * This is the most important event type for Python profiling
 *
 * The structure is written to the file as raw bytes, immediately followed by
 * the variable-length payload described at the end of the struct.
 */
typedef struct {
    struct BaseEvent base;   // Common event header (event = PerfLoad)
    uint32_t process_id;     // Process ID where code was generated (getpid())
#if defined(__APPLE__)
    uint64_t thread_id;      // Thread ID; 64-bit here because it is filled in by pthread_threadid_np()
#else
    uint32_t thread_id;      // Thread ID obtained via syscall(SYS_gettid)
#endif
    uint64_t vma;            // Virtual memory address where code is loaded
    uint64_t code_address;   // Address of the actual machine code (written with the same value as vma)
    uint64_t code_size;      // Size of the machine code in bytes
    uint64_t code_id;        // Unique identifier for this code region (per-process counter)
    /* Followed by:
     * - null-terminated function name string ("py::<name>:<filename>")
     * - code_size bytes of machine code; the writer overwrites or XORs some
     *   of the emitted bytes with a marker so each record's bytes are unique
     */
} CodeLoadEvent;

/*
 * Code unwinding information event - provides DWARF data for stack traces
 * Essential for proper stack unwinding during profiling
 *
 * The writer emits this record immediately before the CodeLoadEvent of the
 * code region it describes.
 */
typedef struct {
    struct BaseEvent base;      // Common event header (event = PerfUnwindingInfo)
    uint64_t unwind_data_size;  // sizeof(EhFrameHeader) plus the size of the .eh_frame data
    uint64_t eh_frame_hdr_size; // Size of the EH frame header (sizeof(EhFrameHeader))
    uint64_t mapped_size;       // unwind_data_size rounded up to a multiple of 16
    /* Followed, in the order the writer emits them, by:
     * - DWARF .eh_frame unwinding information
     * - EH frame header (EhFrameHeader)
     * - Zero padding so the whole record is a multiple of 8 bytes
     */
} CodeUnwindingInfoEvent;

/*
 * EH Frame Header structure for DWARF unwinding
 *
 * This header provides metadata about the .eh_frame data that the writer
 * emits directly before it. It uses PC-relative and data-relative encodings
 * to keep the synthesized DSO self-contained when perf injects it.
 *
 * The struct is packed and written to the file as raw bytes: four 1-byte
 * fields followed by four 4-byte fields, 20 bytes in total (checked by the
 * static assertion below).
 */
typedef struct __attribute__((packed)) {
    uint8_t version;            // Written as 1
    uint8_t eh_frame_ptr_enc;   // Encoding of eh_frame_ptr (sdata4 | pcrel)
    uint8_t fde_count_enc;      // Encoding of eh_fde_count (udata4)
    uint8_t table_enc;          // Encoding of the table entry below (sdata4 | datarel)
    int32_t eh_frame_ptr;       // Negative offset from this field back to the start of the .eh_frame data
    uint32_t eh_fde_count;      // Number of table entries; the writer always stores 1
    int32_t from;               // First half of the single table entry (NOTE(review): presumably the code start - confirm)
    int32_t to;                 // Second half of the single table entry (NOTE(review): presumably the FDE location - confirm)
} EhFrameHeader;
_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");

// =============================================================================
//                              GLOBAL STATE MANAGEMENT
// =============================================================================

/*
 * Global state for the perf jitdump implementation
 *
 * This structure maintains all the state needed for generating jitdump files.
 * It's designed as a singleton since there's typically only one jitdump file
 * per Python process.
 *
 * perf_map doubles as the "initialized" flag: perf_map_jit_init() returns
 * early when it is non-NULL and perf_map_jit_fini() resets it to NULL.
 */
typedef struct {
    FILE* perf_map;          // File handle for the jitdump file (NULL while closed)
    PyMutex map_lock;        // Serializes initialization, record writing and closing the file
    void* mapped_buffer;     // Memory-mapped first page (signals perf we're active); NULL on macOS
    size_t mapped_size;      // Size of the mapped region (one page)
    uint32_t code_id;        // Counter for unique code region identifiers
    uint64_t build_id_salt;  // Per-process salt for unique synthetic DSOs
} PerfMapJitState;

/* Global singleton instance; has static storage, so it starts out zeroed */
static PerfMapJitState perf_jit_map_state;

// =============================================================================
//                              TIME UTILITIES
// =============================================================================

/* Number of nanoseconds in one second */
static const intptr_t nanoseconds_per_second = 1000000000;

/*
 * Read the monotonic clock and return it as a nanosecond count
 *
 * Event timestamps use CLOCK_MONOTONIC so that adjustments of the system
 * clock cannot reorder or distort the time between events.
 *
 * Returns: Nanoseconds elapsed since the (arbitrary) epoch of the
 * monotonic clock
 */
static int64_t get_current_monotonic_ticks(void) {
    struct timespec now;
    const int rc = clock_gettime(CLOCK_MONOTONIC, &now);
    if (rc != 0) {
        Py_UNREACHABLE();  // Should never fail on supported systems
        return 0;
    }

    /* Scale the whole seconds to nanoseconds, then add the sub-second part */
    const int64_t whole_seconds_ns = (int64_t)now.tv_sec * nanoseconds_per_second;
    return whole_seconds_ns + now.tv_nsec;
}

/*
 * Read the wall clock and return it as a microsecond count
 *
 * The jitdump file header carries this value. Unlike the monotonic clock
 * it is real calendar time, so it can be correlated with other system
 * events.
 *
 * Returns: Microseconds since the Unix epoch
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;
    const int rc = gettimeofday(&now, NULL);
    if (rc < 0) {
        Py_UNREACHABLE();  // Should never fail on supported systems
        return 0;
    }

    const int64_t seconds_as_us = (int64_t)(now.tv_sec) * 1000000;
    return seconds_as_us + now.tv_usec;
}

// =============================================================================
//                              FILE I/O UTILITIES
// =============================================================================

/*
 * Write data to the jitdump file, retrying after partial writes
 *
 * fwrite() may accept fewer bytes than requested, so the remainder is
 * written again until everything has been handed to the stream.
 *
 * Failure handling is best-effort. If the jitdump file is not open, or if
 * fwrite() makes no progress (for example because the disk is full), the
 * remaining bytes are dropped and the function returns; the jitdump file
 * may then be truncated or incomplete. A write failure is a normal runtime
 * condition, so it must not be marked with Py_UNREACHABLE(): in release
 * builds that would be undefined behavior and could make this loop spin
 * forever.
 *
 * Args:
 *   buffer: Pointer to data to write
 *   size: Number of bytes to write
 */
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
    FILE* out_file = perf_jit_map_state.perf_map;
    if (out_file == NULL) {
        return;  // File not open (never initialized or already finalized)
    }

    const char* ptr = (const char*)(buffer);
    while (size > 0) {
        const size_t written = fwrite(ptr, 1, size, out_file);
        if (written == 0) {
            return;  // Write failure: give up on the rest of this buffer
        }
        size -= written;
        ptr += written;
    }
}

/*
 * Write the jitdump file header
 *
 * The header must be written exactly once at the beginning of each jitdump
 * file. It provides metadata that perf uses to parse the rest of the file.
 *
 * Args:
 *   pid: Process ID to include in the header
 *   out_file: File handle to write to (currently unused, uses global state)
 */
static void perf_map_jit_write_header(int pid, FILE* out_file) {
    Header header;

    /* Initialize header with required values */
    header.magic = 0x4A695444;                    // "JiTD" magic number
    header.version = 1;                           // Current jitdump version
    header.size = sizeof(Header);                 // Header size for validation
    header.elf_mach_target = GetElfMachineArchitecture();  // Target architecture
    header.reserved = 0;                          // padding reserved for future use
    header.process_id = pid;                      // Process identifier
    header.time_stamp = get_current_time_microseconds();   // Creation time
    header.flags = 0;                             // No special flags currently used

    perf_map_jit_write_fully(&header, sizeof(header));
}

// =============================================================================
//                              JITDUMP INITIALIZATION
// =============================================================================

/*
 * Initialize the perf jitdump interface
 *
 * This function sets up everything needed to generate jitdump files:
 * 1. Creates (or truncates) the jitdump file /tmp/jit-<pid>.dump
 * 2. Maps the first page to signal perf that we're using the interface
 *    (skipped on macOS)
 * 3. Wraps the descriptor in a fully buffered FILE* and writes the
 *    jitdump header
 * 4. Resets the code ID counter, derives the per-process build-id salt and
 *    publishes the code padding and alignment through trampoline_api
 *
 * The memory mapping is crucial - perf detects jitdump files by scanning
 * for processes that have mapped files matching the pattern /tmp/jit-*.dump
 *
 * The whole sequence runs with map_lock held. If the jitdump file is already
 * open the function returns immediately, so it can be called before every
 * write. On failure everything acquired so far (file descriptor and memory
 * mapping) is released again, so a later retry starts from a clean state.
 *
 * Returns: Pointer to initialized state, or NULL on failure
 */
static void* perf_map_jit_init(void) {
    PyMutex_Lock(&perf_jit_map_state.map_lock);
    if (perf_jit_map_state.perf_map != NULL) {
        /* Already initialized */
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return &perf_jit_map_state;
    }

    char filename[100];
    int pid = getpid();

    /* Create unique filename based on process ID */
    snprintf(filename, sizeof(filename), "/tmp/jit-%d.dump", pid);

    /* Create/open the jitdump file with appropriate permissions */
    const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
    if (fd == -1) {
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL;  // Failed to create file
    }

    /* Get system page size for memory mapping */
    const long page_size = sysconf(_SC_PAGESIZE);
    if (page_size == -1) {
        close(fd);
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL;  // Failed to get page size
    }

#if defined(__APPLE__)
    // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
    perf_jit_map_state.mapped_buffer = NULL;
#else
    /*
     * Map the first page of the jitdump file
     *
     * This memory mapping serves as a signal to perf that this process
     * is generating JIT code. Perf scans /proc/.../maps looking for mapped
     * files that match the jitdump naming pattern.
     *
     * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
     */
    perf_jit_map_state.mapped_buffer = mmap(
        NULL,                    // Let kernel choose address
        page_size,               // Map one page
        PROT_READ | PROT_EXEC,   // Read and execute permissions (required by perf)
        MAP_PRIVATE,             // Private mapping
        fd,                      // File descriptor
        0                        // Offset 0 (first page)
    );

    if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
        perf_jit_map_state.mapped_buffer = NULL;
        close(fd);
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL;  // Memory mapping failed
    }
    (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size,
                               "cpython:perf_jit_trampoline");
#endif

    perf_jit_map_state.mapped_size = page_size;

    /* Convert file descriptor to FILE* for easier I/O operations */
    perf_jit_map_state.perf_map = fdopen(fd, "w+");
    if (perf_jit_map_state.perf_map == NULL) {
        /*
         * Undo the mapping created above. Without this, every retry (one
         * per attempted write) would map another page and lose track of
         * the previous one.
         */
        if (perf_jit_map_state.mapped_buffer != NULL) {
            munmap(perf_jit_map_state.mapped_buffer,
                   perf_jit_map_state.mapped_size);
            perf_jit_map_state.mapped_buffer = NULL;
        }
        perf_jit_map_state.mapped_size = 0;
        close(fd);
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        return NULL;  // Failed to create FILE*
    }

    /*
     * Set up file buffering for better performance
     *
     * We use a large buffer (2MB) because jitdump files can be written
     * frequently during program execution. Buffering reduces system call
     * overhead and improves overall performance.
     */
    setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);

    /* Write the jitdump file header */
    perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);

    /* Initialize code ID counter and the per-process build-id salt */
    perf_jit_map_state.code_id = 0;
    perf_jit_map_state.build_id_salt =
        ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks();

    /* Calculate padding size based on actual unwind info requirements */
    size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0);
    size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
    trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16);
    trampoline_api.code_alignment = 32;

    PyMutex_Unlock(&perf_jit_map_state.map_lock);
    return &perf_jit_map_state;
}

// =============================================================================
//                              MAIN JITDUMP ENTRY WRITING
// =============================================================================

/*
 * Write a complete jitdump entry for a code region with a provided name.
 *
 * This shares the same implementation as the trampoline callback, but
 * allows callers that don't have a PyCodeObject to reuse the jitdump
 * infrastructure.
 *
 * Two records are written back to back while map_lock is held:
 * 1. A CodeUnwindingInfoEvent followed by the .eh_frame data, the
 *    EhFrameHeader and alignment padding
 * 2. A CodeLoadEvent followed by the name "py::<entry>:<filename>" and the
 *    machine code bytes (with a uniqueness marker patched in)
 *
 * The function is best-effort: if initialization, name formatting, memory
 * allocation or unwind-info generation fails, or if the jitdump file has
 * been closed in the meantime, nothing is written.
 *
 * Args:
 *   state: Unused; the global perf_jit_map_state is used instead
 *   code_addr: Address where the compiled code resides
 *   code_size: Size of the compiled code in bytes
 *   entry: Function name, or NULL for an empty name
 *   filename: Source file name, or NULL for an empty name
 */
static void perf_map_jit_write_entry_with_name(
    void *state,
    const void *code_addr,
    size_t code_size,
    const char *entry,
    const char *filename
)
{
    (void)state;

    /* Initialize jitdump system on first use */
    void* ret = perf_map_jit_init();
    if (ret == NULL) {
        return;  // Initialization failed, silently abort
    }

    if (entry == NULL) {
        entry = "";
    }
    if (filename == NULL) {
        filename = "";
    }

    /*
     * Create formatted function name for perf display
     *
     * Format: "py::<function_name>:<filename>"
     * The "py::" prefix helps identify Python functions in mixed-language
     * profiles (e.g., when profiling C extensions alongside Python code).
     */
    const int formatted_length = snprintf(NULL, 0, "py::%s:%s", entry, filename);
    if (formatted_length < 0) {
        return;  // Formatting failed, so there is no name to record
    }
    const size_t perf_map_entry_size = (size_t)formatted_length + 1;
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
    if (perf_map_entry == NULL) {
        return;  // Memory allocation failed
    }
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);

    const size_t name_length = strlen(perf_map_entry);
    uword base = (uword)code_addr;
    uword size = code_size;

    /*
     * Generate DWARF unwinding information
     *
     * DWARF data is essential for proper stack unwinding during profiling.
     * Without it, perf cannot generate accurate call graphs, especially
     * in optimized code where frame pointers may be omitted.
     */
    uint8_t buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
    size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
        buffer, sizeof(buffer), code_addr, code_size, 0);
    if (eh_frame_size == 0) {
        PyMem_RawFree(perf_map_entry);
        return;
    }

    /*
     * A logical jitdump entry is written as multiple records and also consumes
     * a process-global code_id. Serialize the whole sequence so concurrent JIT
     * compilation cannot interleave records or reuse an ID.
     */
    PyMutex_Lock(&perf_jit_map_state.map_lock);

    /*
     * The lock was released between initialization and this point, so
     * perf_map_jit_fini() may have closed the file in the meantime.
     */
    if (perf_jit_map_state.perf_map == NULL) {
        PyMutex_Unlock(&perf_jit_map_state.map_lock);
        PyMem_RawFree(perf_map_entry);
        return;
    }

    /*
     * Write Code Unwinding Information Event
     *
     * This event must be written before the code load event to ensure
     * perf has the unwinding information available when it processes
     * the code region.
     */
    CodeUnwindingInfoEvent ev2;
    memset(&ev2, 0, sizeof(ev2));  // Never write indeterminate bytes to the file
    ev2.base.event = PerfUnwindingInfo;
    ev2.base.time_stamp = get_current_monotonic_ticks();
    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;

    /* Verify we don't exceed our padding budget */
    assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);

    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
    ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16);  // 16-byte alignment

    /* Calculate total event size with padding */
    int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size);
    int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size;  // 8-byte align
    ev2.base.size = (uint32_t)(content_size + padding_size);

    /* Write the unwinding info event header */
    perf_map_jit_write_fully(&ev2, sizeof(ev2));

    /*
     * Build the EH Frame Header
     *
     * The EH frame header provides metadata about the DWARF unwinding
     * information. It is written after the .eh_frame data, so the offsets
     * below point backwards and are negative.
     */
    EhFrameHeader f;
    f.version = 1;
    f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel;
    f.fde_count_enc = DWRF_EH_PE_udata4;
    f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel;

    /* Calculate relative offsets for EH frame navigation */
    f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char));
    f.eh_fde_count = 1;  // We generate exactly one FDE per function
    f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size);
    /* The .eh_frame data starts with the CIE: a 4-byte length, then the payload */
    uint32_t cie_payload_size;
    memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size));
    int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size);
    f.to = -(int32_t)(eh_frame_size - cie_size);

    /* Write EH frame data and header */
    perf_map_jit_write_fully(buffer, eh_frame_size);
    perf_map_jit_write_fully(&f, sizeof(f));

    /* Write padding to maintain alignment */
    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
    perf_map_jit_write_fully(&padding_bytes, padding_size);

    /*
     * Write Code Load Event
     *
     * This event tells perf about the new code region. It includes:
     * - Memory addresses and sizes
     * - Process and thread identification
     * - Function name for symbol resolution
     * - The actual machine code bytes
     */
    CodeLoadEvent ev;
    memset(&ev, 0, sizeof(ev));  // Also zeroes struct padding (present on macOS)
    ev.base.event = PerfLoad;
    ev.base.size = sizeof(ev) + (name_length+1) + size;
    ev.base.time_stamp = get_current_monotonic_ticks();
    ev.process_id = getpid();
#if defined(__APPLE__)
    pthread_threadid_np(NULL, &ev.thread_id);
#else
    ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
#endif
    ev.vma = base;                       // Virtual memory address
    ev.code_address = base;              // Same as VMA for our use case
    ev.code_size = size;

    /* Assign unique code ID and increment counter */
    perf_jit_map_state.code_id += 1;
    ev.code_id = perf_jit_map_state.code_id;

    /* Write code load event and associated data */
    perf_map_jit_write_fully(&ev, sizeof(ev));
    perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
    /*
     * Ensure each synthetic DSO has unique .text bytes.
     *
     * perf merges DSOs that share a build-id. Since trampolines can share
     * identical code and unwind bytes, perf may resolve all JIT frames to
     * the first symbol it saw (including entries from previous runs when
     * build-id caching is enabled). Patch a small marker in the emitted
     * bytes to make the build-id depend on a per-process salt and code id
     * without modifying the live code.
     */
    uint64_t marker = perf_jit_map_state.build_id_salt ^
        ((uint64_t)perf_jit_map_state.code_id << 32) ^
        (uint64_t)code_size;
    if (size >= sizeof(marker)) {
        /* Emit the code unchanged except for its last 8 bytes, which become the marker */
        size_t prefix = size - sizeof(marker);
        perf_map_jit_write_fully((void *)(base), prefix);
        perf_map_jit_write_fully(&marker, sizeof(marker));
    }
    else if (size > 0) {
        /* Code shorter than the marker: XOR each byte with one marker byte */
        uint8_t tmp[sizeof(marker)];
        memcpy(tmp, (void *)(base), size);
        for (size_t i = 0; i < size; i++) {
            tmp[i] ^= (uint8_t)(marker >> (i * 8));
        }
        perf_map_jit_write_fully(tmp, size);
    }

    /* Release the lock, then clean up allocated memory */
    PyMutex_Unlock(&perf_jit_map_state.map_lock);
    PyMem_RawFree(perf_map_entry);
}

/*
 * Write a complete jitdump entry for a Python function
 *
 * This is the main function called by Python's trampoline system whenever
 * a new piece of JIT-compiled code needs to be recorded. It extracts the
 * qualified name and filename from the code object and hands them to
 * perf_map_jit_write_entry_with_name(), which writes both the unwinding
 * information and the code load event to the jitdump file.
 *
 * A missing code object, a missing name, or a name that cannot be encoded
 * as UTF-8 is recorded as an empty string. PyUnicode_AsUTF8() sets an
 * exception when it fails; because this callback returns void and cannot
 * report it, the exception is cleared here so it does not leak into the
 * code that runs after the callback.
 *
 * Args:
 *   state: Jitdump state (currently unused, uses global state)
 *   code_addr: Address where the compiled code resides
 *   code_size: Size of the compiled code in bytes
 *   co: Python code object containing metadata (may be NULL)
 *
 * IMPORTANT: This function signature is part of Python's internal API
 * and must not be changed without coordinating with core Python development.
 */
static void perf_map_jit_write_entry(void *state, const void *code_addr,
                                     size_t code_size, PyCodeObject *co)
{
    const char *entry = "";
    const char *filename = "";
    if (co != NULL) {
        if (co->co_qualname != NULL) {
            entry = PyUnicode_AsUTF8(co->co_qualname);
            if (entry == NULL) {
                PyErr_Clear();  // Best-effort: fall back to an empty name
                entry = "";
            }
        }
        if (co->co_filename != NULL) {
            filename = PyUnicode_AsUTF8(co->co_filename);
            if (filename == NULL) {
                PyErr_Clear();  // Best-effort: fall back to an empty filename
                filename = "";
            }
        }
    }
    perf_map_jit_write_entry_with_name(state, code_addr, code_size,
                                       entry, filename);
}

/*
 * Record a named code region in the jitdump file without a PyCodeObject.
 *
 * Thin wrapper around perf_map_jit_write_entry_with_name() for callers
 * that only have a name and a filename.
 *
 * Args:
 *   code_addr: Address where the compiled code resides
 *   code_size: Size of the compiled code in bytes
 *   entry: Name to record for the region (NULL is treated as "")
 *   filename: File name to record for the region (NULL is treated as "")
 */
void
_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
                          const char *entry, const char *filename)
{
    /* The shared writer does not read its state argument */
    void *unused_state = NULL;
    perf_map_jit_write_entry_with_name(unused_state, code_addr, code_size,
                                       entry, filename);
}

// =============================================================================
//                              CLEANUP AND FINALIZATION
// =============================================================================

/*
 * Finalize and cleanup the perf jitdump system
 *
 * This function is called when Python is shutting down or when the
 * perf trampoline system is being disabled. It closes the jitdump file
 * (flushing all buffered data) and removes the memory mapping that
 * signals perf.
 *
 * Both steps happen with map_lock held. perf_map_jit_init() updates the
 * file handle and the mapping under the same lock, so releasing the lock
 * between the two steps would allow a concurrent re-initialization to have
 * its fresh mapping unmapped by this function.
 *
 * Args:
 *   state: Jitdump state (currently unused, uses global state)
 *
 * Returns: 0 on success
 *
 * IMPORTANT: This function signature is part of Python's internal API
 * and must not be changed without coordinating with core Python development.
 */
static int perf_map_jit_fini(void* state) {
    (void)state;

    PyMutex_Lock(&perf_jit_map_state.map_lock);

    /*
     * Close jitdump file
     *
     * Holding the lock ensures no other thread is in the middle of
     * writing a record when the file is closed.
     */
    if (perf_jit_map_state.perf_map != NULL) {
        fclose(perf_jit_map_state.perf_map);  // This also flushes buffers
        perf_jit_map_state.perf_map = NULL;
    }

    /*
     * Unmap the memory region
     *
     * This removes the signal to perf that we were generating JIT code.
     * After this point, perf will no longer detect this process as
     * having JIT capabilities.
     */
    if (perf_jit_map_state.mapped_buffer != NULL) {
        munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
        perf_jit_map_state.mapped_buffer = NULL;
        perf_jit_map_state.mapped_size = 0;
    }

    PyMutex_Unlock(&perf_jit_map_state.map_lock);

    /* Clear global state reference */
    trampoline_api.state = NULL;

    return 0;  // Success
}

// =============================================================================
//                              PUBLIC API EXPORT
// =============================================================================

/*
 * Perf jitdump callback table
 *
 * Python's trampoline system drives the jitdump backend through this
 * table of three function pointers: one to initialize the backend, one to
 * record a code region, and one to shut the backend down.
 *
 * CRITICAL: This structure and its contents are part of Python's internal
 * API. The function signatures and behavior must remain stable to maintain
 * compatibility with the Python interpreter's perf integration system.
 *
 * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
 */
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
    perf_map_jit_init,         // Set up the jitdump file and global state
    perf_map_jit_write_entry,  // Record one code region
    perf_map_jit_fini,         // Close the file and release the mapping
};

#endif /* PY_HAVE_PERF_TRAMPOLINE */