trinity/objects.c at master · kernelslacker/trinity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include "arch.h"
#include "child.h"
#include "debug.h"
#include "deferred-free.h"
#include "fd.h"
#include "list.h"
#include "locks.h"
#include "objects.h"
#include "params.h"
#include "pc_format.h"
#include "pids.h"
#include "random.h"
#include "rnd.h"
#include "shm.h"
#include "stats_ring.h"
#include "trinity.h"
#include "utils.h"

/*
 * Registration list of global-object providers.  Both link pointers
 * start out referring to the head itself; register_global_obj_init()
 * appends entries and init_global_objects() walks them.
 */
static struct list_head global_obj_list = { &global_obj_list, &global_obj_list };

/*
 * Parent-private OBJ_GLOBAL pool.  Populated pre-fork by every
 * REG_GLOBAL_OBJ provider via add_object(OBJ_GLOBAL); the per-child
 * snapshot in clone_global_objects_to_child() reads this array.
 * Lives in the parent's data segment, fork-COW'd into children whose
 * resolver (get_objhead) routes around it in favour of their own
 * private copy.
 */
static struct objhead parent_global_objects[MAX_OBJECT_TYPES];	/* indexed by enum objecttype */

/*
 * Parent-private fd->object hash and parallel compact live-fd list.
 * Same shape as the per-child snapshots; fd_hash_insert / fd_hash_remove
 * mutate these from the parent's pre-fork init and post-fork fd-event
 * drains.  Children read their own snapshots; the parent reads these
 * directly when servicing remove_object_by_fd() out of fd_event_drain().
 */
/* Open-addressed slot array; a slot whose fd is -1 is empty. */
static struct fd_hash_entry parent_fd_hash[FD_HASH_SIZE];
/* fds appended on hash insert, swap-removed on hash remove (capped at FD_LIVE_MAX). */
static int parent_fd_live[FD_LIVE_MAX];
/* Number of occupied slots in parent_fd_hash[]. */
static unsigned int parent_fd_hash_count;
/* Number of valid entries at the front of parent_fd_live[]. */
static unsigned int parent_fd_live_count;

/*
 * Queue a global-object provider so init_global_objects() will run its
 * init hook.  The entry's list node is appended to global_obj_list via
 * list_add_tail().
 */
void register_global_obj_init(struct global_obj_entry *entry)
{
	struct list_head *node = (struct list_head *) &entry->list;

	list_add_tail(node, &global_obj_list);
}

/*
 * Walk global_obj_list and call every registered provider's init()
 * hook, announcing the provider's name first.
 */
void init_global_objects(void)
{
	struct list_head *node;

	list_for_each(node, &global_obj_list) {
		struct global_obj_entry *provider;

		/* List nodes are cast straight back to their owning entry. */
		provider = (struct global_obj_entry *) node;

		output(1, "Initializing %s objects.\n", provider->name);
		provider->init();
	}
}

/*
 * Hash table mapping fd → (object, type) for O(1) lookup in the
 * parent's remove_object_by_fd().  Open-addressing with linear
 * probing.  The parent's view sits in parent_fd_hash[]; each child
 * holds an independent snapshot in child->fd_hash[] populated by
 * clone_global_objects_to_child().
 */

void fd_hash_init(void)
{
	unsigned int i;

	for (i = 0; i < FD_HASH_SIZE; i++) {
		parent_fd_hash[i].fd = -1;
		parent_fd_hash[i].gen = 0;
	}
	parent_fd_hash_count = 0;
	/*
	 * fd_live[] entries are gated by fd_live_count, so initialising
	 * just the count is sufficient; stale slot contents past the
	 * count are never read.
	 */
	parent_fd_live_count = 0;
}

/*
 * Append fd to the parent's parallel live-fd list.  Called from
 * fd_hash_insert() after transitioning a slot from empty to occupied.
 * Single-writer (the parent); no cross-process coherence required.
 * Silently drops the entry if the cap is hit; the auditor that reads
 * via the per-child snapshot tolerates a missed fd.
 */
static void fd_live_append(int fd)
{
	/* List already at capacity: drop the fd rather than overrun. */
	if (parent_fd_live_count >= FD_LIVE_MAX)
		return;

	parent_fd_live[parent_fd_live_count++] = fd;
}

/*
 * Swap-remove fd from the parent's parallel live-fd list.  Linear scan
 * over parent_fd_live[0..count); typical occupancy is a few hundred
 * entries so the cost is negligible.
 */
static void fd_live_remove(int fd)
{
	unsigned int last;
	unsigned int pos;

	/* Locate the first occurrence of fd in the populated prefix. */
	for (pos = 0; pos < parent_fd_live_count; pos++) {
		if (parent_fd_live[pos] == fd)
			break;
	}
	if (pos == parent_fd_live_count)
		return;		/* fd is not on the list */

	/*
	 * Fill the hole with the tail element and shrink the list by one.
	 * When fd already is the tail the copy is a self-assignment.
	 */
	last = parent_fd_live_count - 1;
	parent_fd_live[pos] = parent_fd_live[last];
	parent_fd_live_count = last;
}

/* Home slot for fd: the fd value masked with FD_HASH_SIZE - 1. */
static unsigned int fd_hash_slot(int fd)
{
	const unsigned int mask = FD_HASH_SIZE - 1;

	return (unsigned int) fd & mask;
}

/*
 * Internal insert that preserves the entry's existing generation and
 * doesn't update fd_hash_count.  Used by fd_hash_remove to re-hash
 * displaced entries: the entry's identity is unchanged, only its slot.
 */
static void fd_hash_reinsert(int fd, struct object *obj, enum objecttype type,
			     uint32_t gen)
{
	unsigned int slot;
	unsigned int probe;

	slot = fd_hash_slot(fd);
	for (probe = 0; probe < FD_HASH_SIZE; probe++) {
		if (parent_fd_hash[slot].fd == -1)
			break;
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
	if (probe == FD_HASH_SIZE) {
		shm->stats.fd_hash_reinsert_dropped++;
		outputerr("fd_hash_reinsert: table full, dropping fd %d\n", fd);
		return;
	}

	parent_fd_hash[slot].obj = obj;
	parent_fd_hash[slot].type = type;
	parent_fd_hash[slot].gen = gen;
	parent_fd_hash[slot].fd = fd;
}

bool fd_hash_insert(int fd, struct object *obj, enum objecttype type)
{
	unsigned int slot;

	if (fd < 0)
		return true;

	if (parent_fd_hash_count >= FD_HASH_SIZE)
		return false;

	slot = fd_hash_slot(fd);
	while (parent_fd_hash[slot].fd != -1 && parent_fd_hash[slot].fd != fd)
		slot = (slot + 1) & (FD_HASH_SIZE - 1);

	if (parent_fd_hash[slot].fd == -1) {
		parent_fd_hash_count++;
		fd_live_append(fd);
	}

	parent_fd_hash[slot].obj = obj;
	parent_fd_hash[slot].type = type;
	parent_fd_hash[slot].gen++;
	parent_fd_hash[slot].fd = fd;
	return true;
}

/*
 * Remove fd from the parent's fd->object hash and the parallel live-fd
 * list.  Negative fds and fds that are not in the table are ignored.
 */
void fd_hash_remove(int fd)
{
	unsigned int slot, next, i;

	if (fd < 0)
		return;

	slot = fd_hash_slot(fd);
	/* Probe linearly from the home slot, at most one full pass. */
	for (i = 0; i < FD_HASH_SIZE; i++) {
		/* An empty slot terminates the probe chain: fd is absent. */
		if (parent_fd_hash[slot].fd == -1)
			return;
		if (parent_fd_hash[slot].fd == fd) {
			/* Bump the generation as the slot is vacated. */
			parent_fd_hash[slot].gen++;
			parent_fd_hash[slot].fd = -1;
			fd_live_remove(fd);
			/*
			 * Re-seat every entry in the run of occupied slots
			 * that follows, so a lookup probing through the
			 * newly emptied slot does not stop short of them.
			 * fd_hash_reinsert() stores the gen it is given and
			 * does not touch parent_fd_hash_count.
			 */
			next = (slot + 1) & (FD_HASH_SIZE - 1);
			while (parent_fd_hash[next].fd != -1) {
				struct fd_hash_entry displaced = parent_fd_hash[next];
				parent_fd_hash[next].fd = -1;
				fd_hash_reinsert(displaced.fd, displaced.obj,
						 displaced.type, displaced.gen);
				next = (next + 1) & (FD_HASH_SIZE - 1);
			}
			/* Only the requested fd left the table. */
			parent_fd_hash_count--;
			return;
		}
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
}

/*
 * Child-side removal: drop fd from the calling child's own fd_hash
 * table (child->fd_hash).  Does nothing for a negative fd, when called
 * in the main process (mypid() == mainpid), or when the child has no
 * fd_hash table.
 *
 * NOTE(review): unlike fd_hash_remove(), this path does not adjust
 * child->fd_hash_count or child->fd_live -- confirm that is intended.
 */
void fd_hash_remove_local(int fd)
{
	struct childdata *child;
	struct fd_hash_entry *table;
	unsigned int slot, next, i;

	if (fd < 0)
		return;

	if (mypid() == mainpid)
		return;

	child = this_child();
	if (child == NULL || child->fd_hash == NULL)
		return;

	table = child->fd_hash;
	slot = fd_hash_slot(fd);
	/* Probe linearly from the home slot, at most one full pass. */
	for (i = 0; i < FD_HASH_SIZE; i++) {
		/* An empty slot terminates the probe chain: fd is absent. */
		if (table[slot].fd == -1)
			return;
		if (table[slot].fd == fd) {
			table[slot].gen++;
			table[slot].fd = -1;
			/*
			 * Re-seat the run of occupied slots that follows so
			 * later probes do not stop at the new gap.  Each
			 * displaced entry is copied back whole (gen
			 * included) into the first slot on its own probe
			 * path that is empty or already holds its fd.
			 */
			next = (slot + 1) & (FD_HASH_SIZE - 1);
			while (table[next].fd != -1) {
				struct fd_hash_entry displaced = table[next];
				unsigned int rs;

				table[next].fd = -1;
				rs = fd_hash_slot(displaced.fd);
				while (table[rs].fd != -1 &&
				       table[rs].fd != displaced.fd)
					rs = (rs + 1) & (FD_HASH_SIZE - 1);
				table[rs] = displaced;
				next = (next + 1) & (FD_HASH_SIZE - 1);
			}
			return;
		}
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
}

/*
 * Find the hash entry for fd.  Returns a pointer into the table that
 * was searched, or NULL when fd is negative or not present.
 */
struct fd_hash_entry *fd_hash_lookup(int fd)
{
	struct fd_hash_entry *table = parent_fd_hash;
	unsigned int idx, remaining;

	if (fd < 0)
		return NULL;

	/*
	 * The main process always searches its own table.  Any other
	 * process searches its child->fd_hash snapshot once one exists,
	 * and the parent table until then.
	 */
	if (mypid() != mainpid) {
		struct childdata *child = this_child();

		if (child != NULL && child->fd_hash != NULL)
			table = child->fd_hash;
	}

	idx = fd_hash_slot(fd);
	for (remaining = FD_HASH_SIZE; remaining > 0; remaining--) {
		struct fd_hash_entry *entry = &table[idx];
		int seen = entry->fd;

		if (seen == fd)
			return entry;
		if (seen == -1)
			return NULL;	/* end of the probe chain */

		idx = (idx + 1) & (FD_HASH_SIZE - 1);
	}
	return NULL;
}

/* True when type falls inside the OBJ_FD_PIPE..OBJ_FD_WATCH_QUEUE range. */
static bool is_fd_type(enum objecttype type)
{
	if (type < OBJ_FD_PIPE)
		return false;
	if (type > OBJ_FD_WATCH_QUEUE)
		return false;
	return true;
}

/*
 * Per-objhead fd→object hash for OBJ_LOCAL fd-typed pools.
 *
 * Open-addressing with linear probing into a fixed power-of-two slot array
 * (LOCAL_FD_HASH_SIZE).  fd == -1 marks empty.  The table lives in the
 * owning child's private heap, as does the objhead that points at it
 * (see local_objects_alloc()), so neither is reachable from any other
 * address space -- the same shape head->array uses for OBJ_LOCAL
 * pools.
 *
 * Replaces the O(n) linear walk over head->array in
 * find_local_object_by_fd() with a single hash probe.  That function is
 * called from register_returned_fd() on every successful RET_FD syscall
 * whose entry->ret_objtype is not OBJ_NONE (open, openat, socket, accept,
 * eventfd, timerfd, perf_event_open, io_uring_setup, memfd_create,
 * pidfd, fanotify_init, etc.), so the saving applies on the syscall hot
 * path with head->num_entries typically in the tens-to-low-hundreds.
 */
static unsigned int local_fd_hash_slot_idx(int fd)
{
	/* Home slot: fd masked with LOCAL_FD_HASH_SIZE - 1. */
	const unsigned int mask = LOCAL_FD_HASH_SIZE - 1;

	return (unsigned int)fd & mask;
}

static void local_fd_hash_alloc(struct objhead *head)
{
	unsigned int i;

	head->fd_hash = malloc(LOCAL_FD_HASH_SIZE *
			       sizeof(struct local_fd_hash_slot));
	if (head->fd_hash == NULL)
		return;
	for (i = 0; i < LOCAL_FD_HASH_SIZE; i++) {
		head->fd_hash[i].fd = -1;
		head->fd_hash[i].obj = NULL;
	}
}

/*
 * Internal insert that does not check for an existing entry — used by
 * local_fd_hash_remove() to re-seat displaced entries after a removal.
 * The displaced entry's identity is unchanged, so the original (fd, obj)
 * pair is reinserted unconditionally into the first empty slot.
 */
static void local_fd_hash_reinsert(struct objhead *head, int fd,
				   struct object *obj)
{
	unsigned int slot, probe;

	slot = local_fd_hash_slot_idx(fd);
	for (probe = 0; probe < LOCAL_FD_HASH_SIZE; probe++) {
		if (head->fd_hash[slot].fd == -1) {
			head->fd_hash[slot].fd = fd;
			head->fd_hash[slot].obj = obj;
			return;
		}
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
}

static void local_fd_hash_insert(struct objhead *head, int fd,
				 struct object *obj)
{
	unsigned int slot, probe;

	if (fd < 0)
		return;
	if (head->fd_hash == NULL) {
		local_fd_hash_alloc(head);
		if (head->fd_hash == NULL)
			return;
	}

	slot = local_fd_hash_slot_idx(fd);
	for (probe = 0; probe < LOCAL_FD_HASH_SIZE; probe++) {
		if (head->fd_hash[slot].fd == -1 ||
		    head->fd_hash[slot].fd == fd) {
			head->fd_hash[slot].fd = fd;
			head->fd_hash[slot].obj = obj;
			return;
		}
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
	/*
	 * Table saturated.  Realistically unreachable — LOCAL_FD_HASH_SIZE
	 * sits well above any per-(child, type) pool we have observed —
	 * but if it ever happens the caller gracefully falls back to the
	 * uninserted state: find_local_object_by_fd() returns NULL and
	 * register_returned_fd() simply re-adds, which is the same outcome
	 * as the pre-hash linear walk missing the entry.  Bump a stat so
	 * the silent drop is observable in the end-of-run summary.
	 */
	__atomic_add_fetch(&shm->stats.local_fd_hash_insert_dropped, 1,
			   __ATOMIC_RELAXED);
}

/*
 * Remove fd from head's local fd hash.  Does nothing when fd is
 * negative, when the table was never allocated, or when fd is not in
 * the table.
 */
static void local_fd_hash_remove(struct objhead *head, int fd)
{
	unsigned int slot, next, i;

	if (fd < 0 || head->fd_hash == NULL)
		return;

	slot = local_fd_hash_slot_idx(fd);
	/* Probe linearly from the home slot, at most one full pass. */
	for (i = 0; i < LOCAL_FD_HASH_SIZE; i++) {
		/* An empty slot terminates the probe chain: fd is absent. */
		if (head->fd_hash[slot].fd == -1)
			return;
		if (head->fd_hash[slot].fd == fd) {
			head->fd_hash[slot].fd = -1;
			head->fd_hash[slot].obj = NULL;
			/*
			 * Linear-probing removal: re-seat any entries in the
			 * chain following us so a later lookup that hashes
			 * past this newly-empty slot still finds them.
			 */
			next = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
			while (head->fd_hash[next].fd != -1) {
				/* Copy out before clearing; reinsert uses the copy. */
				struct local_fd_hash_slot displaced =
					head->fd_hash[next];
				head->fd_hash[next].fd = -1;
				head->fd_hash[next].obj = NULL;
				local_fd_hash_reinsert(head, displaced.fd,
						       displaced.obj);
				next = (next + 1) & (LOCAL_FD_HASH_SIZE - 1);
			}
			return;
		}
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
}

static struct object *local_fd_hash_lookup(struct objhead *head, int fd)
{
	unsigned int slot, i;

	if (fd < 0 || head->fd_hash == NULL)
		return NULL;

	slot = local_fd_hash_slot_idx(fd);
	for (i = 0; i < LOCAL_FD_HASH_SIZE; i++) {
		if (head->fd_hash[slot].fd == -1)
			return NULL;
		if (head->fd_hash[slot].fd == fd)
			return head->fd_hash[slot].obj;
		slot = (slot + 1) & (LOCAL_FD_HASH_SIZE - 1);
	}
	return NULL;
}

/*
 * Every obj struct comes from alloc_object() (zmalloc) and lives in
 * the allocating process's private heap.  OBJ_GLOBAL pools are
 * populated pre-fork in the parent, then fork-COW'd into children's
 * snapshots; OBJ_LOCAL pools are wholly per-child.  No path crosses
 * the shared mapping for obj storage.
 */
struct object * alloc_object(void)
{
	struct object *obj;

	/* heap_brk_maybe_refresh() runs before the allocation, every time. */
	heap_brk_maybe_refresh();

	obj = zmalloc_tracked(sizeof(*obj));
	return obj;
}

/*
 * Release an obj struct.  Routed through deferred_free_enqueue()
 * rather than free()'d immediately so a stale slot pointer that
 * survived past __destroy_object() lands on a chunk with a 5-50
 * syscall TTL (effective 80-800 with DEFERRED_TICK_BATCH) instead
 * of glibc-reclaimed memory: get_map() and friends read &obj->map
 * after taking the slot pointer out of head->array, and the arg-gen
 * path that invoked get_map() can hold the pointer across the
 * window in which the slot's owner destroys the obj.
 *
 * Zero the chunk before handing it to the deferred-free ring so a
 * post-destroy read (via a stale slot pointer) trips the size==0
 * band of consumer sanity checks instead of dereferencing an obj
 * whose name string or mmap pointer was already torn down by the
 * destructor.
 */
static void release_obj(struct object *obj,
			enum obj_scope scope __attribute__((unused)),
			enum objecttype type __attribute__((unused)))
{
	const size_t obj_size = sizeof(struct object);

	/* Scrub the whole struct first, then queue it for deferred free. */
	memset(obj, 0, obj_size);
	deferred_free_enqueue(obj);
}

/*
 * Resolve the objhead for (scope, type) as seen by the calling process.
 *
 * OBJ_GLOBAL: a process other than the main one uses its own
 * child->global_objects[] snapshot once it exists (allocated by
 * clone_global_objects_to_child); otherwise, and always in the main
 * process, the parent's parent_global_objects[] is used.  This path
 * never returns NULL.
 *
 * Any other scope (OBJ_LOCAL): the calling child's objects[] array.
 * Returns NULL when there is no current child or when the child's
 * objects array has not been allocated, so callers can treat the pool
 * as absent instead of receiving a pointer derived from a NULL base.
 *
 * type is used as an array index without range checking.
 */
struct objhead * get_objhead(enum obj_scope scope, enum objecttype type)
{
	struct childdata *child;

	if (scope == OBJ_GLOBAL) {
		if (mypid() != mainpid) {
			child = this_child();

			if (child != NULL && child->global_objects != NULL)
				return &child->global_objects[type];
		}
		/* Main process, or the child snapshot is not set up yet. */
		return &parent_global_objects[type];
	}

	child = this_child();
	if (child == NULL || child->objects == NULL)
		return NULL;

	return &child->objects[type];
}


/*
 * Snapshot helper for the for_each_obj iterator macro.  Captures
 * num_entries and array into the caller's state struct so the loop
 * body operates on a per-invocation hoist rather than re-loading
 * head fields on every iteration.  No cross-process coherence is
 * required post-Stage-5 — every pool lives in the iterating
 * process's private heap.
 */
void __for_each_obj_init(struct objhead *head,
			 struct __for_each_obj_state *s)
{
	/* Capture the array first; a pool without one iterates zero times. */
	s->array_snap = head->array;
	s->n_snap = (s->array_snap != NULL) ? head->num_entries : 0;
}

/*
 * add_object - publish obj into the (scope, type) pool.
 *
 * obj:   object to add.  On every early-return path in this function
 *        obj is passed to release_obj(); the function returns void, so
 *        callers get no indication that the add was refused.
 * scope: OBJ_GLOBAL adds to the parent's pool and is honoured only in
 *        the main process; OBJ_LOCAL adds to the calling child's pool.
 * type:  selects the pool; for fd types the fd is range-checked first
 *        and mirrored into the matching fd hash afterwards.
 *
 * NOTE(review): this spot used to carry a comment describing fixed-
 * capacity OBJ_GLOBAL arrays in MAP_SHARED memory, with the bound
 * exposed in objects.h for other code (e.g. mm/maps.c).  The code
 * below instead grows both OBJ_GLOBAL and OBJ_LOCAL arrays by doubling
 * with zmalloc()/zmalloc_tracked(), so that description looks stale --
 * confirm against objects.h.
 *
 * Marked noinline so __builtin_return_address(0) — used both in the
 * verbose-mode caller trace at the top of the function and in the
 * bad-fd outputerr path after it — names the actual add_object()
 * callsite rather than whatever frame the inliner chose to fold us
 * into.  Caller attribution is the only reason those PCs are captured;
 * losing it to inlining defeats the diagnostic.
 */
__attribute__((noinline))
void add_object(struct object *obj, enum obj_scope scope, enum objecttype type)
{
	struct objhead *head;
	unsigned int n, cap;
	char pcbuf[128];

	if (unlikely(verbosity > 1)) {
		output(2, "ADD-OBJ slot=%p type=%d caller=%s\n", obj, type,
			pc_to_string(__builtin_return_address(0), pcbuf, sizeof(pcbuf)));
	}

	/*
	 * Reject obviously-corrupted fd values before they enter any pool.
	 * 1<<20 = 1048576 matches the kernel's NR_OPEN ceiling
	 * (include/uapi/linux/fs.h), the absolute upper bound RLIMIT_NOFILE
	 * may be raised to on every distro we exercise -- so any retval
	 * decoding to a value past this is a smoking-gun upper-bit
	 * corruption (sign-extended or wholesale-stomped rec->retval) that
	 * the existing "(long)retval >= 0" gate in register_returned_fd /
	 * the per-syscall .post handlers let through because the lower bits
	 * happened to be positive.
	 */
	if (is_fd_type(type)) {
		int fd = fd_from_object(obj, type);

		if (fd < 0 || fd >= (1 << 20)) {
			outputerr("add_object: rejecting out-of-bound fd=%d "
				  "type=%u caller=%s\n", fd, type,
				  pc_to_string(__builtin_return_address(0),
					       pcbuf, sizeof(pcbuf)));
			post_handler_corrupt_ptr_bump_site(NULL,
							   __builtin_return_address(0),
							   "add_object:fd");
			/* The rejected fd value is not closed here. */
			release_obj(obj, scope, type);
			return;
		}
	}

	/*
	 * Stamp the pool tag now that the obj has passed the fd-bound
	 * gate and is about to enter a pool.  Read back by
	 * objpool_check() in consumers (the post-2026-05-18 audit sweep
	 * across fds/ + syscalls/keyctl.c + childops/kvm-run-churn.c)
	 * to catch wild-obj-pointer derefs the loose 47-bit VA-range
	 * shape check lets through.  release_obj()'s memset zeroes the
	 * chunk on the way back to the deferred-free ring, which
	 * naturally invalidates the tag to OBJ_NONE for any future
	 * stale-pointer reader.
	 */
	obj->obj_type = type;

	/*
	 * OBJ_GLOBAL is pre-fork-only by construction: every provider
	 * REG_GLOBAL_OBJ init runs in the parent before fork_children(),
	 * and the per-child snapshot is taken at fork time.  A post-fork
	 * child that reached add_object(OBJ_GLOBAL) would mutate only its
	 * private copy with no benefit, so route the call to nowhere.
	 *
	 * NOTE(review): for fd types this path (and the head == NULL path
	 * just below) releases obj without close()ing its fd, whereas the
	 * allocation-failure paths further down do close it -- confirm
	 * the difference is intended.
	 */
	if (scope == OBJ_GLOBAL && mypid() != mainpid) {
		release_obj(obj, scope, type);
		return;
	}

	head = get_objhead(scope, type);
	if (head == NULL) {
		release_obj(obj, scope, type);
		return;
	}

	/* n is the slot the new obj will occupy; cap is the array size. */
	n = head->num_entries;
	cap = head->array_capacity;

	if (scope == OBJ_GLOBAL) {
		if (n >= cap) {
			/*
			 * Grow on the parent's private heap.  No concurrent
			 * reader to coordinate with -- children see a snapshot
			 * pinned at fork time, so a post-fork grow in the
			 * parent is invisible to them and a pre-fork grow has
			 * no readers yet.
			 */
			struct object **newarray;
			unsigned int newcap = cap ? cap * 2 : 16;

			/* cap * 2 above would have wrapped: refuse to grow. */
			if (cap > UINT_MAX / 2) {
				outputerr("add_object: cap overflow type=%u num_entries=%u capacity=%u\n",
					  type, n, cap);
				if (is_fd_type(type)) {
					int fd = fd_from_object(obj, type);
					if (fd >= 0)
						close(fd);
				}
				release_obj(obj, scope, type);
				return;
			}
			newarray = zmalloc(newcap * sizeof(struct object *));
			if (newarray == NULL) {
				outputerr("add_object: malloc failed for type %u (cap %u)\n",
					  type, newcap);
				if (is_fd_type(type)) {
					int fd = fd_from_object(obj, type);
					if (fd >= 0)
						close(fd);
				}
				release_obj(obj, scope, type);
				return;
			}
			if (head->array != NULL && cap > 0)
				memcpy(newarray, head->array,
				       cap * sizeof(struct object *));
			/* Old array is freed immediately on the global path. */
			free(head->array);
			head->array = newarray;
			head->array_capacity = newcap;
			cap = newcap;
		}
	} else if (n >= cap) {
		/*
		 * OBJ_LOCAL grow on the owning child's private heap.  Use
		 * the same allocate-copy-defer-free shape that closed the
		 * UAF on the array container reachable through cached
		 * head->array reads in the arg-gen path: the deferred-free
		 * ring gives the old chunk a 5-50 syscall (effective
		 * 80-800 with DEFERRED_TICK_BATCH) TTL, far longer than
		 * any in-flight reader's window.  Same hazard shape as
		 * the obj-struct fix (3a8d344f0f73, 546f576fae24).
		 */
		struct object **newarray;
		struct object **oldarray;
		unsigned int newcap = cap ? cap * 2 : 16;

		/* cap * 2 above would have wrapped: refuse to grow. */
		if (cap > UINT_MAX / 2) {
			outputerr("add_object: cap overflow type=%u num_entries=%u capacity=%u\n",
				  type, n, cap);
			if (is_fd_type(type)) {
				int fd = fd_from_object(obj, type);
				if (fd >= 0)
					close(fd);
			}
			release_obj(obj, scope, type);
			return;
		}
		newarray = zmalloc_tracked(newcap * sizeof(struct object *));
		if (newarray == NULL) {
			outputerr("add_object: malloc failed for type %u (cap %u)\n",
				  type, newcap);
			if (is_fd_type(type)) {
				int fd = fd_from_object(obj, type);
				if (fd >= 0)
					close(fd);
			}
			release_obj(obj, scope, type);
			return;
		}
		oldarray = head->array;
		if (oldarray != NULL && cap > 0)
			memcpy(newarray, oldarray, cap * sizeof(struct object *));
		head->array = newarray;
		head->array_capacity = newcap;
		cap = newcap;
		if (oldarray != NULL)
			deferred_free_enqueue(oldarray);
	}

	head->array[n] = obj;
	obj->array_idx = n;
	/*
	 * Stamp the per-pool monotonic identity tag.  Pre-increment so
	 * the first issued value is 1; the zero left by release_obj()'s
	 * memset on a freed obj is reserved as a never-issued sentinel.
	 * Stamped after the slot-array insert and the array_idx assign
	 * so any consumer that re-reads obj fields off head->array sees
	 * a fully populated obj as soon as num_entries below admits it.
	 */
	obj->slot_version = ++head->next_slot_version;
	/*
	 * Stamp the publish-time fleet op tick from the child-readable
	 * mirror page.  parent_stats.op_count is MAP_PRIVATE heap so
	 * a child COW-copy goes stale immediately after fork; the
	 * shm_published mirror is the republished, child-visible copy
	 * of the same counter.  No current reader -- pre-stage field
	 * for the upcoming diag-drain consumer.
	 */
	obj->publish_call_nr = shm_published ? shm_published->fleet_op_count : 0;
	head->num_entries = n + 1;

	/* Mirror the parent-side global fd hash for OBJ_LOCAL fd-typed
	 * pools so find_local_object_by_fd() resolves in O(1).  The buffer
	 * is lazily allocated by local_fd_hash_insert() on first use. */
	if (scope == OBJ_LOCAL && is_fd_type(type)) {
		int fd = fd_from_object(obj, type);

		if (fd >= 0)
			local_fd_hash_insert(head, fd, obj);
	}

	/* Track global fd-type objects in the parent's fd_hash so
	 * remove_object_by_fd() and the per-child snapshot can resolve
	 * them by fd. */
	if (scope == OBJ_GLOBAL && is_fd_type(type)) {
		int fd = fd_from_object(obj, type);

		if (!fd_hash_insert(fd, obj, type)) {
			outputerr("add_object: fd hash full for type %u, dropping fd %d\n",
				  type, fd);
			/* Undo the publish above: retract the slot and the count. */
			head->num_entries = n;
			head->array[n] = NULL;
			if (fd >= 0)
				close(fd);
			release_obj(obj, scope, type);
			return;
		}

		/* Per-provider outstanding-fd gauge: bump on successful
		 * registration into the parent's global fd_hash.  Paired
		 * with the decrement in fd_event_drain()'s CLOSE arm,
		 * which looks the type back up via fd_hash_lookup() on
		 * the consumer side. */
		__atomic_fetch_add(&shm->stats.fd_provider_outstanding[type],
				   1, __ATOMIC_RELAXED);
	}

	/* Per-object dumps are debug noise at startup (NFUTEXES = 5 * cpus
	 * identical "futex: 0 owner:0 scope:1" lines, etc.).  Gate on -vv. */
	if (head->dump != NULL && verbosity > 2)
		head->dump(obj, scope);

	/* if we just added something to a child list, check
	 * to see if we need to do some pruning. */
	if (scope == OBJ_LOCAL)
		prune_objects();
}

/*
 * Lazy per-child alloc for the OBJ_LOCAL objhead array, in the owning
 * child's private heap.  Runs from init_child() after fork, so the
 * allocation lands in the child's own address space and is unreachable
 * from any other process.  Failure leaves child->objects == NULL and
 * the OBJ_LOCAL path inert for this child -- callers must NULL-check
 * before touching child->objects.
 */
static void local_objects_alloc(struct childdata *child)
{
	if (child == NULL)
		return;

	/* Already set up: leave the existing array alone. */
	if (child->objects != NULL)
		return;

	child->objects = zmalloc(MAX_OBJECT_TYPES * sizeof(struct objhead));
}

void init_object_lists(enum obj_scope scope, struct childdata *child)
{
	unsigned int i;

	if (scope == OBJ_LOCAL) {
		if (child == NULL)
			return;
		/*
		 * struct childdata lives in alloc_shared() memory, which
		 * __alloc_shared() poisons with random bytes to expose
		 * uninitialised reads.  The objects pointer therefore
		 * arrives at first init holding a wild value, not NULL --
		 * local_objects_alloc()'s "skip if non-NULL" guard would
		 * then leave child->objects pointing at the poison.  Zero
		 * the field before the alloc to neutralise the poison.
		 */
		child->objects = NULL;
		local_objects_alloc(child);
		if (child->objects == NULL)
			return;
	}

	for (i = 0; i < MAX_OBJECT_TYPES; i++) {
		struct objhead *head;

		if (scope == OBJ_GLOBAL)
			head = &parent_global_objects[i];
		else
			head = &child->objects[i];

		head->num_entries = 0;
		head->array = NULL;
		head->array_capacity = 0;
		head->fd_hash = NULL;
		head->next_slot_version = 0;

		/*
		 * child lists can inherit properties from global lists.
		 */
		if (scope == OBJ_LOCAL) {
			struct objhead *globalhead;
			globalhead = &parent_global_objects[i];
			head->max_entries = globalhead->max_entries;
			head->destroy = globalhead->destroy;
			head->dump = globalhead->dump;
		}
	}
}

/*
 * Lift the parent's pre-fork OBJ_GLOBAL pool into the owning child's
 * private heap.  The parent populates shm->global_objects[] in
 * init_global_objects() before fork; each child then runs this routine
 * from init_child() to take a shallow snapshot of the head fields and
 * the live slot pointers into the child's own zmalloc'd backing.
 *
 * Bookkeeping only.  The obj structs themselves (and the kernel-side
 * fds / mmap regions they describe) are reached via fork's table dup
 * and the existing MAP_SHARED obj heap that backs the parent's pool —
 * snapshotting the directory of pointers is sufficient for the child
 * to pick, dereference and locally destroy entries without crossing
 * back into shared memory.
 *
 * Per-type array allocation is sized to the parent's current
 * num_entries rather than the pre-fork GLOBAL_OBJ_MAX_CAPACITY ceiling
 * so an empty pool costs zero heap bytes here and a small pool costs
 * exactly num_entries pointers, keeping the per-child memory cost
 * proportional to the live working set.
 *
 * NULL on out-of-memory leaves child->global_objects unset so the
 * get_objhead() fallback to shm->global_objects[] is selected for
 * this child's lifetime; the OBJ_GLOBAL path degrades to its pre-
 * lift behaviour rather than crashing.
 */
void clone_global_objects_to_child(struct childdata *child)
{
	unsigned int i;

	if (child == NULL)
		return;

	child->global_objects = NULL;
	child->fd_hash = NULL;
	child->fd_live = NULL;
	child->fd_hash_count = 0;
	child->fd_live_count = 0;

	child->global_objects = zmalloc(sizeof(struct objhead) * MAX_OBJECT_TYPES);
	if (child->global_objects == NULL)
		return;

	for (i = 0; i < MAX_OBJECT_TYPES; i++) {
		struct objhead *src = &parent_global_objects[i];
		struct objhead *dst = &child->global_objects[i];
		unsigned int n = src->num_entries;

		dst->max_entries = src->max_entries;
		dst->destroy = src->destroy;
		dst->dump = src->dump;
		dst->num_entries = n;
		dst->array_capacity = n;
		dst->fd_hash = NULL;
		dst->array = NULL;
		/*
		 * Carry the parent's next_slot_version into the child snapshot
		 * so any captured (obj, version) pair stashed by a consumer
		 * pre-fork continues to compare correctly against entries the
		 * child sees post-fork.  The child never adds to OBJ_GLOBAL
		 * (the mypid()!=mainpid early-return in add_object()) so the
		 * snapshot value is read-only on the child side; copying it
		 * just keeps the field self-consistent rather than starting
		 * the child's mirror at zero.
		 */
		dst->next_slot_version = src->next_slot_version;

		if (n == 0 || src->array == NULL)
			continue;

		dst->array = zmalloc(n * sizeof(struct object *));
		if (dst->array == NULL) {
			dst->array_capacity = 0;
			dst->num_entries = 0;
			continue;
		}
		memcpy(dst->array, src->array, n * sizeof(struct object *));
	}

	child->fd_hash = zmalloc(FD_HASH_SIZE * sizeof(struct fd_hash_entry));
	if (child->fd_hash != NULL) {
		memcpy(child->fd_hash, parent_fd_hash,
		       FD_HASH_SIZE * sizeof(struct fd_hash_entry));
		child->fd_hash_count = parent_fd_hash_count;
	}

	child->fd_live = zmalloc(FD_LIVE_MAX * sizeof(int));
	if (child->fd_live != NULL) {
		memcpy(child->fd_live, parent_fd_live, FD_LIVE_MAX * sizeof(int));
		child->fd_live_count = parent_fd_live_count;
	}
}

/*
 * Pick a random object from a pool.  Single-writer per pool, single
 * reader per call (the owning process) -- no locks, no version
 * counters, no snapshot defences.  Children read their fork-time
 * snapshot of the parent's pre-fork OBJ_GLOBAL pool; OBJ_LOCAL pools
 * are wholly per-child.  An empty pool returns NULL.
 */
struct object * get_random_object(enum objecttype type, enum obj_scope scope)
{
	struct objhead *pool = get_objhead(scope, type);
	unsigned int count;

	if (pool == NULL)
		return NULL;

	/* An empty pool, or one with no backing array, has nothing to pick. */
	count = pool->num_entries;
	if (count == 0)
		return NULL;
	if (pool->array == NULL)
		return NULL;

	return pool->array[rnd_modulo_u32(count)];
}

bool objects_empty(enum objecttype type)
{
	struct objhead *head = get_objhead(OBJ_GLOBAL, type);

	if (head == NULL)
		return true;
	return head->num_entries == 0;
}

bool objects_pool_empty(enum obj_scope scope, enum objecttype type)
{
	struct objhead *head = get_objhead(scope, type);

	if (head == NULL)
		return true;
	return head->num_entries == 0;
}

/*
 * Invalidate the fd stored in an object by setting it to -1.
 * Used before calling the destructor when the fd was already closed
 * (e.g. after a successful close() syscall) to prevent double-close.
 * The destructor's close(-1) call will harmlessly return EBADF.
 */
static void invalidate_object_fd(struct object *obj, enum objecttype type)
{
	switch (type) {
	case OBJ_FD_PIPE:	obj->pipeobj.fd = -1; break;
	case OBJ_FD_DEVFILE:	obj->fileobj.fd = -1; break;
	case OBJ_FD_DEV_TEMPLATE: obj->fileobj.fd = -1; break;
	case OBJ_FD_PROCFILE:	obj->fileobj.fd = -1; break;