Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 28 additions & 32 deletions dev/TODO.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,33 @@
# TODO

## High priority: Instrument with OpenTelemetry — in progress

- [ ] Add OTel-native traces, metrics, and wide log events in `src-py-lib`.
- [ ] Add shared OTel bootstrap config/helpers with `--otel` and standard
`OTEL_*` env-var-backed CLI args.
- [ ] Replace custom trace-context propagation with OTel W3C propagation.
- [ ] Instrument shared HTTP and GraphQL clients manually, preserving safe
sanitized attributes and Sourcegraph-specific metadata.
- [ ] Rename Sourcegraph debug tracing from `--trace` to `--fetch-sg-traces`.
- [ ] Wire `src-auth-perms-sync` to the shared OTel bootstrap without doing
import-time logger/provider setup.
- [ ] Verify pyright, tests, and CLI help in both repos.
## High priority: Sync modes

### Fast

- Additive modes, to add new users’ perms quickly,
without the extraneous load on the database of a full sync
- Take a list of usernames and/or email addresses as input,
query users on the instance for these,
then trigger a perms sync for found users
- Query the instance for all new users, which do not yet have explicit perms
- Query the instance for all new repos, which do not yet have explicit perms

### Full: Overwrite all perms

- Separate full sync mode with an arg

## High priority: Remote trigger on demand

- Sourcegraph webhook for new user coming in v7.4.0
- Requested a webhook for new repos
- Receive the webhook event
- Parse the new user / repo name
- Run a lightweight sync for the changed user / repo

- Where does this run? Sidecar in the customer's environment? CI job?
Sourcegraph executor?
- How do we avoid stampedes (e.g., bulk repo sync triggering thousands
of re-runs)?

## High priority: End to End test cases

Expand All @@ -38,26 +54,6 @@
replace aliased `User.permissionsInfo.repositories(source: API)` calls before
raising concurrency further.

## Medium priority: Lightweight incremental updates

- When a new user's account is created, or a new repo is synced from a code host,
they sit outside the mapping until this script runs again
- Find ways to create lightweight run modes to check for new users / repos,
and create the needed perms for them

## Medium priority: Remote trigger on demand

- Sourcegraph webhook for new user coming in v7.4.0
- Requested a webhook for new repos
- Receive the webhook event
- Parse the new user / repo name
- Run a lightweight sync for the changed user / repo

- Where does this run? Sidecar in the customer's environment? CI job?
Sourcegraph executor?
- How do we avoid stampedes (e.g., bulk repo sync triggering thousands
of re-runs)?

## Low priority: Repo-centric path, when users > repos, or for cross-checking

We previously had a repo-centric capture path
Expand Down
16 changes: 8 additions & 8 deletions dev/memory-efficiency-generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def main() -> int:
explicit_permissions_batch_size=arguments.explicit_permissions_batch_size,
http_timeout_seconds=arguments.http_timeout_seconds,
sample_interval=arguments.sample_interval,
trace=arguments.trace,
fetch_sg_traces=arguments.fetch_sg_traces,
sourcegraph_user_count=total_user_count,
sourcegraph_inventory_repo_count=inventory_repo_count,
)
Expand Down Expand Up @@ -371,9 +371,9 @@ def build_parser() -> argparse.ArgumentParser:
help="CLI --sample-interval for resource samples (default: 1).",
)
parser.add_argument(
"--trace",
"--fetch-sg-traces",
action="store_true",
help="Pass --trace to src-auth-perms-sync sweep runs.",
help="Pass --fetch-sg-traces to src-auth-perms-sync sweep runs.",
)
return parser

Expand Down Expand Up @@ -841,7 +841,7 @@ def run_sweep(
explicit_permissions_batch_size: int,
http_timeout_seconds: float,
sample_interval: float,
trace: bool,
fetch_sg_traces: bool,
sourcegraph_user_count: int,
sourcegraph_inventory_repo_count: int,
) -> list[CommandRunResult]:
Expand All @@ -858,7 +858,7 @@ def run_sweep(
explicit_permissions_batch_size=explicit_permissions_batch_size,
http_timeout_seconds=http_timeout_seconds,
sample_interval=sample_interval,
trace=trace,
fetch_sg_traces=fetch_sg_traces,
)
environment = command_environment(endpoint, access_token)
process = subprocess.run(
Expand Down Expand Up @@ -911,7 +911,7 @@ def command_arguments(
explicit_permissions_batch_size: int,
http_timeout_seconds: float,
sample_interval: float,
trace: bool,
fetch_sg_traces: bool,
) -> list[str]:
arguments = [
*shlex.split(command),
Expand All @@ -932,8 +932,8 @@ def command_arguments(
arguments.append("--apply")
if mode == "apply-no-backup":
arguments.append("--no-backup")
if trace:
arguments.append("--trace")
if fetch_sg_traces:
arguments.append("--fetch-sg-traces")
return arguments


Expand Down
2 changes: 1 addition & 1 deletion dev/memory-efficiency-monitor-sourcegraph.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Examples:
--output-dir /tmp/src-auth-perms-sync-load-$(date -u +%Y%m%d-%H%M%S)

In another terminal, run:
uv run python dev/test-end-to-end.py --trace --sample-interval 0 --external-sample-interval 0
uv run python dev/test-end-to-end.py --fetch-sg-traces --sample-interval 0 --external-sample-interval 0
EOF
}

Expand Down
20 changes: 11 additions & 9 deletions dev/memory-efficiency.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ bulk explicit-permissions APIs.

## Capture a focused trace

Run a small command with `--trace`. This sends `X-Sourcegraph-Should-Trace` and
a sampled W3C `traceparent` header on each GraphQL request. Sourcegraph may
also return `x-trace`, `x-trace-span`, and `x-trace-url` response headers.
Run a small command with `--fetch-sg-traces`. This sends
`X-Sourcegraph-Request-Trace` and a W3C `traceparent` header on each GraphQL
request. Sourcegraph may also return `x-trace`, `x-trace-span`, and
`x-trace-url` response headers.

```bash
uv run src-auth-perms-sync get \
--trace \
--fetch-sg-traces \
--sample-interval 0 \
--parallelism 2 \
--explicit-permissions-batch-size 25
Expand Down Expand Up @@ -87,13 +88,14 @@ artifacts. Keep them in `/tmp` unless a human asks to preserve them.

## Trace the end-to-end matrix

Prefer the end-to-end runner as the single orchestrator. With `--trace`, it
passes tracing to every child CLI command, tails child JSON logs, and fetches
Jaeger traces in the background while each child command is still running.
Prefer the end-to-end runner as the single orchestrator. With
`--fetch-sg-traces`, it passes Sourcegraph debug trace collection to every child
CLI command, tails child JSON logs, and fetches Jaeger traces in the background
while each child command is still running.

```bash
uv run python dev/test-end-to-end.py \
--trace \
--fetch-sg-traces \
--sample-interval 0 \
--external-sample-interval 0 \
--results-json /tmp/src-auth-perms-sync-end-to-end-trace.json \
Expand Down Expand Up @@ -136,7 +138,7 @@ artifact paths into the result JSON:

```bash
uv run python dev/test-end-to-end.py \
--trace \
--fetch-sg-traces \
--monitor-sourcegraph-load \
--sample-interval 0 \
--external-sample-interval 0 \
Expand Down
32 changes: 17 additions & 15 deletions dev/test-end-to-end.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,12 @@ class EndToEndConfig(src.SourcegraphClientConfig, src.LoggingConfig):
cli_action="store_true",
help="Continue after assertion failures where it is safe to do so",
)
trace: bool = src.config_field(
fetch_sg_traces: bool = src.config_field(
default=False,
env_var="SRC_AUTH_PERMS_SYNC_E2E_TRACE",
cli_flag="--trace",
env_var="SRC_AUTH_PERMS_SYNC_E2E_FETCH_SG_TRACES",
cli_flag="--fetch-sg-traces",
cli_action="store_true",
help="Pass --trace to each child src-auth-perms-sync command",
help="Pass --fetch-sg-traces to each child src-auth-perms-sync command",
)
jaeger_trace_limit: int | None = src.config_field(
default=DEFAULT_JAEGER_TRACE_LIMIT,
Expand All @@ -205,7 +205,8 @@ class EndToEndConfig(src.SourcegraphClientConfig, src.LoggingConfig):
metavar="N",
ge=0,
help=(
"When --trace is set, fetch and summarize the N slowest GraphQL Jaeger traces "
"When --fetch-sg-traces is set, fetch and summarize the N slowest GraphQL "
"Jaeger traces "
"while each child command runs; omit for all traces, set 0 to disable"
),
)
Expand All @@ -216,7 +217,7 @@ class EndToEndConfig(src.SourcegraphClientConfig, src.LoggingConfig):
metavar="N",
ge=1,
help=(
"Concurrent Jaeger trace fetch requests when --trace is set "
"Concurrent Jaeger trace fetch requests when --fetch-sg-traces is set "
f"(default: {DEFAULT_JAEGER_TRACE_PARALLELISM})"
),
)
Expand All @@ -238,7 +239,7 @@ class EndToEndConfig(src.SourcegraphClientConfig, src.LoggingConfig):
metavar="PATH",
help=(
"Write Jaeger trace summaries incrementally as JSON Lines. Defaults to a sibling "
"of --results-json or --results-csv when --trace is set."
"of --results-json or --results-csv when --fetch-sg-traces is set."
),
)
jaeger_trace_directory: Path | None = src.config_field(
Expand All @@ -248,7 +249,8 @@ class EndToEndConfig(src.SourcegraphClientConfig, src.LoggingConfig):
metavar="PATH",
help=(
"Directory where complete raw Jaeger trace JSON files are written. Defaults "
"to a sibling directory of --results-json or --results-csv when --trace is set."
"to a sibling directory of --results-json or --results-csv when --fetch-sg-traces "
"is set."
),
)
jaeger_retry_delays: tuple[float, ...] = src.config_field(
Expand Down Expand Up @@ -1068,7 +1070,7 @@ def __init__(
*,
iteration: int,
keep_going: bool,
trace: bool,
fetch_sg_traces: bool,
jaeger_trace_limit: int | None,
jaeger_trace_fetch_pool: JaegerTraceFetchPool | None,
sample_interval: float,
Expand All @@ -1078,7 +1080,7 @@ def __init__(
self.environment = environment
self.iteration = iteration
self.keep_going = keep_going
self.trace = trace
self.fetch_sg_traces = fetch_sg_traces
self.jaeger_trace_limit = jaeger_trace_limit
self.jaeger_trace_fetch_pool = jaeger_trace_fetch_pool
self.sample_interval = sample_interval
Expand Down Expand Up @@ -1106,7 +1108,7 @@ def _run_process(self, case: CommandCase) -> CommandResult:
full_command = [
*self.variant.executable,
*case.arguments,
*(("--trace",) if self.trace else ()),
*(("--fetch-sg-traces",) if self.fetch_sg_traces else ()),
"--sample-interval",
str(self.sample_interval),
]
Expand Down Expand Up @@ -1306,11 +1308,11 @@ def run_end_to_end(config: EndToEndConfig) -> None:
try:
if sourcegraph_load_monitor is not None:
sourcegraph_load_monitor.start()
with src.event(
with src.span(
"end_to_end_matrix",
repeat=config.repeat,
variant_count=len(variants),
trace=config.trace,
fetch_sg_traces=config.fetch_sg_traces,
sourcegraph_load_monitor=sourcegraph_load_monitor is not None,
) as matrix_summary:
if sourcegraph_load_monitor is not None:
Expand All @@ -1325,7 +1327,7 @@ def run_end_to_end(config: EndToEndConfig) -> None:
environment,
iteration=iteration,
keep_going=config.keep_going,
trace=config.trace,
fetch_sg_traces=config.fetch_sg_traces,
jaeger_trace_limit=config.jaeger_trace_limit,
jaeger_trace_fetch_pool=jaeger_trace_fetch_pool,
sample_interval=config.sample_interval,
Expand Down Expand Up @@ -1382,7 +1384,7 @@ def create_jaeger_trace_fetch_pool(
config: EndToEndConfig,
) -> JaegerTraceFetchPool | None:
"""Return the shared trace fetch pool for this run, if trace collection is enabled."""
if not config.trace or config.jaeger_trace_limit == 0:
if not config.fetch_sg_traces or config.jaeger_trace_limit == 0:
return None
return JaegerTraceFetchPool(
config,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ classifiers = [
dependencies = [
"json5>=0.14.0",
"pyyaml>=6.0.3",
"src-py-lib==0.1.9",
"src-py-lib[otel]==0.2.1",
]
keywords = [
"Sourcegraph"
Expand Down
23 changes: 15 additions & 8 deletions src/src_auth_perms_sync/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@
COMMON_CONFIG_FIELDS = src.config_field_names(
src.SourcegraphClientConfig,
src.LoggingConfig,
src.OpenTelemetryConfig,
"parallelism",
"http_timeout_seconds",
"max_attempts",
"sample_interval",
"trace",
"fetch_sg_traces",
)
GET_CONFIG_FIELDS = src.config_field_names(
"users",
Expand Down Expand Up @@ -146,7 +147,7 @@ class CliCommand:
config_fields: tuple[str, ...]


class Config(src.SourcegraphClientConfig, src.LoggingConfig):
class Config(src.SourcegraphClientConfig, src.LoggingConfig, src.OpenTelemetryConfig):
"""Config values loaded from defaults, .env, environment, and CLI flags."""

maps_path: Path | None = src.config_field(
Expand Down Expand Up @@ -276,12 +277,12 @@ class Config(src.SourcegraphClientConfig, src.LoggingConfig):
help="Seconds between logging compute resource samples; set 0 to disable (default: 10)",
help_group="Runtime",
)
trace: bool = src.config_field(
fetch_sg_traces: bool = src.config_field(
default=False,
env_var="SRC_AUTH_PERMS_SYNC_TRACE",
cli_flag="--trace",
env_var="SRC_AUTH_PERMS_SYNC_FETCH_SG_TRACES",
cli_flag="--fetch-sg-traces",
cli_action="store_true",
help=("Ask Sourcegraph to retain traces for GraphQL requests and return trace metadata"),
help="Ask Sourcegraph to retain GraphQL traces and return debug trace metadata",
help_group="Runtime",
)

Expand Down Expand Up @@ -515,7 +516,8 @@ def run_fields(config: Config, command: ResolvedCommand, endpoint: str) -> dict[
"endpoint": endpoint,
"parallelism": config.parallelism,
"explicit_permissions_batch_size": config.explicit_permissions_batch_size,
"trace": config.trace,
"fetch_sg_traces": config.fetch_sg_traces,
"open_telemetry": config.open_telemetry,
"max_attempts": config.max_attempts,
"http_timeout_seconds": config.http_timeout_seconds,
"no_backup": config.no_backup,
Expand Down Expand Up @@ -544,7 +546,7 @@ def run_with_client(
endpoint=endpoint,
token=config.src_access_token,
http=http,
trace=config.trace,
fetch_sg_traces=config.fetch_sg_traces,
)
try:
run_command(config, command, client, worker_pool)
Expand Down Expand Up @@ -754,6 +756,11 @@ def _run_or_raise(command_name: CommandName, config: Config) -> None:
log_file=backups.run_log_path(run_directory),
logs_dir=None,
resource_sample_interval_seconds=config.sample_interval,
open_telemetry=src.open_telemetry_settings_from_config(
config,
force_traces=config.fetch_sg_traces,
service_name="src-auth-perms-sync",
),
)

with (
Expand Down
Loading