diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c50c10a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +# Automatically strip outputs from Jupyter notebooks +*.ipynb filter=nbstripout +*.ipynb diff=ipynb + +#qqqq requires pip install nbstripout, nbstripout --install \ No newline at end of file diff --git a/.github/workflows/prod-pr.yml b/.github/workflows/prod-pr.yml index c393c88..ee525b6 100644 --- a/.github/workflows/prod-pr.yml +++ b/.github/workflows/prod-pr.yml @@ -4,7 +4,6 @@ on: pull_request: branches: - main - # Trigger on PRs targeting the dev branch jobs: validate-prod: diff --git a/.gitignore b/.gitignore index ae49032..2826587 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,10 @@ pip-selfcheck.json # ----------------------------- # Jupyter Notebooks # ----------------------------- +# To stop printing of data resulting in it being in source control .ipynb_checkpoints/ +*-checkpoint.ipynb +*.log # ----------------------------- # Scratch / experimental folder @@ -34,6 +37,14 @@ pip-selfcheck.json scratch/** # ignore all files in scratch !scratch/README.md # except placeholder README.md +## Data files (we may decide not to do this but will try it early on so its opt in) +*.csv +*.parquet +*.xlsx +## Databricks +.databricks/ +*.dbc + # ----------------------------- # IDE / editor # ----------------------------- diff --git a/conftest.py-comebackto.txt b/conftest.py-comebackto.txt deleted file mode 100644 index fb99be0..0000000 --- a/conftest.py-comebackto.txt +++ /dev/null @@ -1,99 +0,0 @@ -# copy paste from [Dab repo examples](https://github.com/databricks/bundle-examples/blob/1cf3dba30a897d68e3e74ab17f0a3dff68392f15/default_python/tests/conftest.py) -"""This file configures pytest. - -This file is in the root since it can be used for tests in any place in this -project, including tests under resources/. 
-""" - -import os, sys, pathlib -from contextlib import contextmanager - - -try: - from databricks.connect import DatabricksSession - from databricks.sdk import WorkspaceClient - from pyspark.sql import SparkSession - import pytest - import json - import csv - import os -except ImportError: - raise ImportError( - "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." - ) - - -@pytest.fixture() -def spark() -> SparkSession: - """Provide a SparkSession fixture for tests. - - Minimal example: - def test_uses_spark(spark): - df = spark.createDataFrame([(1,)], ["x"]) - assert df.count() == 1 - """ - return DatabricksSession.builder.getOrCreate() - - -@pytest.fixture() -def load_fixture(spark: SparkSession): - """Provide a callable to load JSON or CSV from fixtures/ directory. - - Example usage: - - def test_using_fixture(load_fixture): - data = load_fixture("my_data.json") - assert data.count() >= 1 - """ - - def _loader(filename: str): - path = pathlib.Path(__file__).parent.parent / "fixtures" / filename - suffix = path.suffix.lower() - if suffix == ".json": - rows = json.loads(path.read_text()) - return spark.createDataFrame(rows) - if suffix == ".csv": - with path.open(newline="") as f: - rows = list(csv.DictReader(f)) - return spark.createDataFrame(rows) - raise ValueError(f"Unsupported fixture type for: {filename}") - - return _loader - - -def _enable_fallback_compute(): - """Enable serverless compute if no compute is specified.""" - conf = WorkspaceClient().config - if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): - return - - url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" - print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) - print(f" see {url} for manual configuration", file=sys.stdout) - - os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" - - -@contextmanager -def 
_allow_stderr_output(config: pytest.Config): - """Temporarily disable pytest output capture.""" - capman = config.pluginmanager.get_plugin("capturemanager") - if capman: - with capman.global_and_fixture_disabled(): - yield - else: - yield - - -def pytest_configure(config: pytest.Config): - """Configure pytest session.""" - with _allow_stderr_output(config): - _enable_fallback_compute() - - # Initialize Spark session eagerly, so it is available even when - # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, - # we validate version compatibility with the remote cluster. - if hasattr(DatabricksSession.builder, "validateSession"): - DatabricksSession.builder.validateSession().getOrCreate() - else: - DatabricksSession.builder.getOrCreate() \ No newline at end of file diff --git a/conftest.py-disablefornow b/conftest.py-disablefornow deleted file mode 100644 index fb99be0..0000000 --- a/conftest.py-disablefornow +++ /dev/null @@ -1,99 +0,0 @@ -# copy paste from [Dab repo examples](https://github.com/databricks/bundle-examples/blob/1cf3dba30a897d68e3e74ab17f0a3dff68392f15/default_python/tests/conftest.py) -"""This file configures pytest. - -This file is in the root since it can be used for tests in any place in this -project, including tests under resources/. -""" - -import os, sys, pathlib -from contextlib import contextmanager - - -try: - from databricks.connect import DatabricksSession - from databricks.sdk import WorkspaceClient - from pyspark.sql import SparkSession - import pytest - import json - import csv - import os -except ImportError: - raise ImportError( - "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." - ) - - -@pytest.fixture() -def spark() -> SparkSession: - """Provide a SparkSession fixture for tests. 
- - Minimal example: - def test_uses_spark(spark): - df = spark.createDataFrame([(1,)], ["x"]) - assert df.count() == 1 - """ - return DatabricksSession.builder.getOrCreate() - - -@pytest.fixture() -def load_fixture(spark: SparkSession): - """Provide a callable to load JSON or CSV from fixtures/ directory. - - Example usage: - - def test_using_fixture(load_fixture): - data = load_fixture("my_data.json") - assert data.count() >= 1 - """ - - def _loader(filename: str): - path = pathlib.Path(__file__).parent.parent / "fixtures" / filename - suffix = path.suffix.lower() - if suffix == ".json": - rows = json.loads(path.read_text()) - return spark.createDataFrame(rows) - if suffix == ".csv": - with path.open(newline="") as f: - rows = list(csv.DictReader(f)) - return spark.createDataFrame(rows) - raise ValueError(f"Unsupported fixture type for: {filename}") - - return _loader - - -def _enable_fallback_compute(): - """Enable serverless compute if no compute is specified.""" - conf = WorkspaceClient().config - if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): - return - - url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" - print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) - print(f" see {url} for manual configuration", file=sys.stdout) - - os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" - - -@contextmanager -def _allow_stderr_output(config: pytest.Config): - """Temporarily disable pytest output capture.""" - capman = config.pluginmanager.get_plugin("capturemanager") - if capman: - with capman.global_and_fixture_disabled(): - yield - else: - yield - - -def pytest_configure(config: pytest.Config): - """Configure pytest session.""" - with _allow_stderr_output(config): - _enable_fallback_compute() - - # Initialize Spark session eagerly, so it is available even when - # SparkSession.builder.getOrCreate() is used. 
For DB Connect 15+, - # we validate version compatibility with the remote cluster. - if hasattr(DatabricksSession.builder, "validateSession"): - DatabricksSession.builder.validateSession().getOrCreate() - else: - DatabricksSession.builder.getOrCreate() \ No newline at end of file diff --git a/databricks.yml b/databricks.yml index c7ac9e9..2b44896 100644 --- a/databricks.yml +++ b/databricks.yml @@ -72,7 +72,7 @@ variables: default: "" # to be used in dev only so staging prod have no prefix value description: To prefix dev user schema names to allow multiple dev users to use same catalog with their own different schema names for development env_name: - description: The environment name (dev, staging, prod) + description: The environment name (personal, dev, staging, prod) storage_account: description: Seperate databricks workspaces dedicated storage (dev, staging, prod) # Storage principles id is used for name field in databricks.yml name the sp variable: _id @@ -116,9 +116,9 @@ targets: host: https://adb-295718430158257.17.azuredatabricks.net # Dev root path under each users home directory # because deploying as service principle -> /Workspace/Users/b0326488-f6b0-4c0f-bf20-a091224bad83/.bundle/DatabricksPOC so we need to decide if to personal area by own username or to a shared dev space - root_path: /Users/${workspace.current_user.userName}/.bundle/${bundle.name}/dev + root_path: /Users/${workspace.current_user.userName}/.bundle/${bundle.name}/personal variables: - env_name: dev + env_name: personal catalog: dev_catalog # only dev has schema prefix as its per user #e.g. phil_ will become phil_bronze_ods if we do layer and domain? as our schema naming convention @@ -139,6 +139,7 @@ targets: # qqqq should we have a personal section for deploying to own space ??? 
dev: # production because cicd is not going to deploying to workspaces it doesnt know the user + # but also that will mean its more intensive so shouldnt it be dev but we are using sp qqqq mode: production default: true workspace: @@ -188,6 +189,13 @@ targets: # prod databricks host host: https://adb-295718430158257.17.azuredatabricks.net root_path: /Workspace/.bundle/${bundle.name}/prod + # Exclude development and test artifacts we are not testing prod + sync: + exclude: + - "tests/*" + - "resources/test/*" + - "scratch/*" + - "setup/*" #setup is instructions and manual setup variables: # when 3 databricks it will be a share catalog name across the databricks env_name: prod diff --git a/docs/deployment_guide.md b/docs/deployment_guide.md index e69de29..1a9f5ec 100644 --- a/docs/deployment_guide.md +++ b/docs/deployment_guide.md @@ -0,0 +1,25 @@ +# Deployment Guide + +The current setup is a POC. We may decide to reduce the number of steps, but it is to support decision making. + +DABs are deployed to 4 contexts. + +**Personal** which is in your own user area folder e.g. users/first.lastname:nhs.net +This location has your branch folder in it and it has your bundle folder in it. + +To deploy a DAB to your personal bundle folder: + +Go to your personal folder +There will be an edit link to view as editable +To the right select the rocket +Select source for the bundle code to come from, this will be your repo +Target personal +Then click deploy + +You should not deploy to the other targets from the UI. The other locations are shared, so tests and git versioning should run first. Deploying to the others is done by merging to the dev, staging, and prod git branches, which triggers DAB deployment processes using service principals.
+ +**DEV** + +After committing and pushing, follow the pull request link in Databricks + + diff --git a/pytest.ini.-use toml instead.txt b/pytest.ini.-use toml instead.txt deleted file mode 100644 index 78b40db..0000000 --- a/pytest.ini.-use toml instead.txt +++ /dev/null @@ -1,30 +0,0 @@ -# qqqq todo ai generated ask someone to scan over it and give a read of the ins and outs -[pytest] -# Pytest configuration for Databricks unit tests - -# Test discovery patterns -python_files = test_*.py -python_classes = Test* -python_functions = test_* - -# Test paths -testpaths = tests - -# Output options -addopts = - -v - --tb=short - --strict-markers - --disable-warnings - -# Markers for organizing tests -markers = - unit: Unit tests that don't require external resources - integration: Integration tests that may require external systems - slow: Tests that take a long time to run - -# Minimum Python version -minversion = 3.8 - -# Directory patterns to ignore -norecursedirs = .git .tox dist build *.egg .venv venv \ No newline at end of file diff --git a/scratch-gitincludeforpoc/dont-source-controlled-sensitive-data-nb-outputs.ipynb b/scratch-gitincludeforpoc/dont-source-controlled-sensitive-data-nb-outputs.ipynb new file mode 100644 index 0000000..e3f1850 --- /dev/null +++ b/scratch-gitincludeforpoc/dont-source-controlled-sensitive-data-nb-outputs.ipynb @@ -0,0 +1,96 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bfc5bbeb-959d-4114-b265-aa6a05e9032f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# nbstripout Test Notebook\n", + "*it seems this is already handled by the setup in git you see the following*\n", + "> Note: This notebook may have outputs, but they are not included because the workspace administrator disabled committing outputs.\n", + "\n", + "This notebook is used to verify that `nbstripout` is correctly
removing cell outputs before commits.\n", + "\n", + "The notebook intentionally generates and displays fake “sensitive” data.\n", + "If outputs appear in Git, nbstripout is not working.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8c82689b-7aa9-4f7e-93e7-5afb822e3f9c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Fake sensitive-looking data\n", + "df = pd.DataFrame({\n", + " \"patient_id\": [\"NHS-100001\", \"NHS-100002\", \"NHS-100003\"],\n", + " \"name\": [\"FAKE Patient A Smith\", \"FAKE Patient B Jones\", \"FAKE Patient C Brown\"],\n", + " \"diagnosis\": [\"Diabetes\", \"Cancer\", \"Asthma\"]\n", + "})\n", + "\n", + "print(\"⚠️ FAKE SENSITIVE DATA — SHOULD NOT BE COMMITTED WITH OUTPUTS ⚠️\")\n", + "display(df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0a94432c-493b-49aa-9c2c-256a8efc0766", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "dont-source-controlled-sensitive-data-nb-outputs", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}