From 88991824b03005f2850a7fc78416d10ece855229 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Thu, 4 Dec 2025 17:00:28 +0000 Subject: [PATCH 01/14] fixing repo --- .github/PULL_REQUEST_TEMPLATE.md | 1 + .github/workflows/ci.yml | 56 ++++ .github/workflows/dev.yml | 1 + .github/workflows/prod-cd.yml | 28 ++ .github/workflows/staging-cicd.yml | 56 ++++ .gitignore | 47 ++++ README.md | 169 +++++++++++- databricks.yml | 172 ++++++++++++ docs/architecture.md | 0 docs/deployment_guide.md | 0 environment-redundant use toml.yml | 0 fixtures/.gitkeep | 9 + notebooks/.gitinclude | 0 pyproject.toml | 32 +++ requirements.txt | 0 resources/configs/.gitinclude | 0 resources/configs/dev.yml | 13 + resources/configs/prod.yml | 0 resources/configs/staging.yml | 0 .../jobs/create_db_connections.dontneedityml | 14 + resources/pipeline/.gitinclude | 0 resources/pipeline/ods_ingestion.yml | 86 ++++++ resources/setup/unity-catalog-try-later.yml | 46 ++++ scripts/.gitinclude | 0 setup/Readme.md | 1 + setup/setup_catalogs.sql | 40 +++ src/.gitinclude | 0 src/bronze/- Copy.gitinclude | 0 src/dlt_pipeline.py | 1 + src/gold/- Copy.gitinclude | 0 .../CreateDatabaseConnections.dontneedipynb | 126 +++++++++ ...t - original for reference delete later.py | 145 ++++++++++ src/ingestion/ods_ingest.py | 247 ++++++++++++++++++ src/silver/- Copy.gitinclude | 0 src/transformations/.gitinclude | 0 src/utils/.gitinclude | 0 tests/.gitinclude | 0 37 files changed, 1289 insertions(+), 1 deletion(-) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/dev.yml create mode 100644 .github/workflows/prod-cd.yml create mode 100644 .github/workflows/staging-cicd.yml create mode 100644 .gitignore create mode 100644 databricks.yml create mode 100644 docs/architecture.md create mode 100644 docs/deployment_guide.md create mode 100644 environment-redundant use toml.yml create mode 100644 fixtures/.gitkeep create mode 100644 notebooks/.gitinclude create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 resources/configs/.gitinclude create mode 100644 resources/configs/dev.yml create mode 100644 resources/configs/prod.yml create mode 100644 resources/configs/staging.yml create mode 100644 resources/jobs/create_db_connections.dontneedityml create mode 100644 resources/pipeline/.gitinclude create mode 100644 resources/pipeline/ods_ingestion.yml create mode 100644 resources/setup/unity-catalog-try-later.yml create mode 100644 scripts/.gitinclude create mode 100644 setup/Readme.md create mode 100644 setup/setup_catalogs.sql create mode 100644 src/.gitinclude create mode 100644 src/bronze/- Copy.gitinclude create mode 100644 src/dlt_pipeline.py create mode 100644 src/gold/- Copy.gitinclude create mode 100644 src/ingestion/CreateDatabaseConnections.dontneedipynb create mode 100644 src/ingestion/ods_ingest - original for reference delete later.py create mode 100644 src/ingestion/ods_ingest.py create mode 100644 src/silver/- Copy.gitinclude create mode 100644 src/transformations/.gitinclude create mode 100644 src/utils/.gitinclude create mode 100644 tests/.gitinclude diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..bc1ff7f --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1 @@ +# TODO QQQQ \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..63b7205 --- /dev/null +++ 
b/.github/workflows/ci.yml @@ -0,0 +1,56 @@
+# We want CI checks that can run independently of Databricks, enforced as branch rules, so that we don't
+# deploy (at cost) code that we already know needs changing
+# such as linting and unit tests for Python, and maybe a DAB verify
+# we run these on all pull requests because a hot fix may not have passed through
+# staging, for example
+# qqqq check this is up to date
+name: CI - Pull Request Checks
+
+# Run CI on all pull requests
+on:
+  pull_request:
+    branches:
+      - '**' # all branches
+
+jobs:
+  ci_checks:
+    name: "Linting, Unit Tests, DAB Verify"
+    runs-on: ubuntu-latest
+
+    steps:
+      # Checkout code
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Set up Python
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      # Install dependencies used for linting and unit tests
+      - name: Install dependencies
+        run: pip install -r requirements-dev.txt
+
+      # Run python unit tests
+      - name: Run Unit Tests
+        run: pytest tests/unit
+
+      # Run python lint
+      # qqqq one example used flake8 instead
+      # pyproject.toml will need configuring
+      - name: Run Linting
+        run: pylint src
+
+      # qqqq to do: run commit lint step and add its config
+      # see TELBlazor
+      - name: Commit lint
+        run: |
+          echo "Commit lint not implemented"
+          exit 1
+
+      # qqqq to do: run version generation step and add its config
+      # see TELBlazor
+      - name: Version Generation Test Run
+        run: |
+          echo "Version test run not implemented"
+          exit 1
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
new file mode 100644
index 0000000..cdb45ea
--- /dev/null
+++ b/.github/workflows/dev.yml
@@ -0,0 +1 @@
+# qqqq i dont think we need a dev pipeline event because people are in their own dev workspace on databricks or a dev branch
\ No newline at end of file
diff --git a/.github/workflows/prod-cd.yml b/.github/workflows/prod-cd.yml
new file mode 100644
index 0000000..b825426
--- /dev/null
+++ b/.github/workflows/prod-cd.yml
@@ -0,0 +1,28 @@
+# No code quality checks here; staging has already run them
+
+name: Deploy to Production Databricks Workspace
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+
+  deploy_prod:
+    name: "Deploy Bundle to Production Environment"
+    runs-on: ubuntu-latest
+    # needs: testing  # no "testing" job exists in this workflow; integration tests run in the staging workflow
+    environment: prod
+    env:
+      DATABRICKS_HOST: ${{ vars.DBX_HOST }}
+      DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }}
+      DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }}
+
+    steps:
+      # qqqq add version and changelog creation step, and give the DAB a version matching the repo version
+      - uses: actions/checkout@v4
+      - uses: databricks/setup-cli@main
+      - name: Deploy bundle
+        run: databricks bundle deploy -t prod --auto-approve
+        working-directory: .
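The `Commit lint` and `Version Generation Test Run` steps in `ci.yml` above are placeholders that currently fail with `exit 1`. As a rough sketch only — the real rules are meant to come from the TELBlazor commit-lint configuration the comments reference, which is not part of this repo — such a step could call a small script along these lines. The path `scripts/check_commit.py` and the Conventional Commits pattern are assumptions, not anything defined by the bundle:

```python
#!/usr/bin/env python3
"""Minimal commit-message check (hypothetical scripts/check_commit.py).

Assumes a Conventional Commits style subject line, e.g. "feat: add ods pipeline".
The actual rules should come from the TELBlazor commit-lint config once adopted.
"""
import re
import subprocess
import sys

# Deliberately small subset of Conventional Commits: type(scope)?!?: subject
PATTERN = re.compile(r"^(feat|fix|docs|chore|refactor|test|ci)(\([\w-]+\))?(!)?: .+")


def latest_commit_message() -> str:
    """Return the subject line of the most recent commit."""
    return subprocess.run(
        ["git", "log", "-1", "--pretty=%s"],
        capture_output=True, text=True, check=True,
    ).stdout.strip()


def main() -> int:
    message = latest_commit_message()
    if PATTERN.match(message):
        print(f"Commit message OK: {message}")
        return 0
    print(f"Commit message does not match Conventional Commits: {message}")
    return 1


if __name__ == "__main__":
    sys.exit(main())
```

The placeholder step would then become something like `run: python scripts/check_commit.py`, failing the pull request when the latest commit subject does not match the agreed format.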
\ No newline at end of file diff --git a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml new file mode 100644 index 0000000..f3e2444 --- /dev/null +++ b/.github/workflows/staging-cicd.yml @@ -0,0 +1,56 @@ +# Rely on ci.yml and branch rules to ensure bundle validation and linting +# we are not going from staging straight to prod, because we use staging for manual testing as well +name: Deploy to Databricks Staging and Trigger Tests in Databricks + +on: + push: + branches: + - staging + +jobs: + deploy_to_staging: + name: "Deploy Bundle to Stage Environment" + runs-on: ubuntu-latest + # qqqq need to set up my git env values + environment: staging + env: + DATABRICKS_HOST: ${{ vars.DBX_HOST }} + DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} + DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} + + steps: + # qqqq add a step to make a version number so can see its changed and print it as part of the job + #DAB_VERSION: "staging-${GITHUB_SHA::7}" # short commit hash + # databricks bundle deploy --environment staging --version $DAB_VERSION + + + - uses: actions/checkout@v4 + - uses: databricks/setup-cli@main + - name: Deploy bundle + # trigger target the staging deploy in databricks.yml + run: databricks bundle deploy -t staging --auto-approve + working-directory: . + + testing: + name: "Integration Testing" + runs-on: ubuntu-latest + needs: deploy_to_staging + environment: staging + env: + DATABRICKS_HOST: ${{ vars.DBX_HOST }} + DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} + DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + id: cache + name: Cache Python build + with: + python-version: "3.x" + cache-dependency-path: "requirements-dev.txt" + cache: "pip" + - name: Install dependencies + run: pip install -r requirements-dev.txt + - name: Integration Testing + run: pytest tests/integration \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..840b1da --- /dev/null +++ b/.gitignore @@ -0,0 +1,47 @@ +# Couldnt find an official gitignore this is AI generated +# ----------------------------- +# Databricks / DAB / dbx +# ----------------------------- +.databricks/ # local workspace metadata / CLI files +.deploy/ # local deploy cache (dbx/DAB) +.bundle/ # local bundle files (dbx/DAB) +*.log # temporary logs +*.tmp # temporary files +dbx_project.yaml.bak # backup of bundle config +build/ +dist/ + +# ----------------------------- +# Python +# ----------------------------- +__pycache__/ +*.pyc +*.pyo +*.pyd +*.egg-info/ +.venv/ +env/ +pip-selfcheck.json + +# ----------------------------- +# Jupyter Notebooks +# ----------------------------- +.ipynb_checkpoints/ + +# ----------------------------- +# Scratch / experimental folder +# ----------------------------- +scratch/** # ignore all files in scratch +!scratch/README.md # except placeholder README.md + +# ----------------------------- +# IDE / editor +# ----------------------------- +.vscode/ +.idea/ + +# ----------------------------- +# OS / system +# ----------------------------- +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/README.md b/README.md index b9915b3..3f2f561 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,169 @@ # DatabricksPOC -Experimenting with databricks workflow +Experimenting with databricks workflow specifically DABS + +# Notable deviations from how final version will work +- Databricks instance per environment + - target the host instance not catalogs + - do not 
need root to be specified
+  - will replace yml hardcoding with variables
+
+# Branching
+- Dev is like local, so maybe PRs and deployment to your own space should not be gated
+  - so lint only
+  - auto merge
+  - unit tests
+  - no PRs
+- staging
+  - require a PR
+  - run tests after merge
+- main/prod
+  - rerun staging tests
+  - integration tests
+  - maybe some review, but unlikely unless we are very motivated to check for DRY early on
+- dev is per user and there is a preference for working within Databricks, so a staging-and-main-only approach seems best. Versioning can be done via a commit tracker on pushes to main
+
+- why are folders named TD-xxx?
+  - if it's WIP, should it live in scratch? or gitignore scratch
+- why are they not on their own branches?
+- we need a README describing the structure and what each part is used for
+
+
+# Refs
+- [Official up to step 5](https://docs.databricks.com/aws/en/ldp/convert-to-dab)
+- [follow this too](https://www.evanazevedo.com/blog/databricks-deployment/)
+  - in this, staging auto-deploys to prod, which we don't want
+  - dev is a shared space rather than per-user, which we don't want
+  - our approach: local development deploys via the client/UI for dev, but staging and prod are deployed by GitHub Actions instead
+- also of use: [multiple projects](https://github.com/datakickstart/datakickstart_dabs/tree/main)
+- [another with loads of examples to drill down into](https://github.com/databricks/bundle-examples)
+
+# Potential Structure
+*Should generate one afterwards*
+[Confluence structure to compare to mine](https://hee-tis.atlassian.net/wiki/spaces/TP/pages/5201494023/GitHub+Structure)
+project-root/
+│
+├── README.md
+├── databricks.yml        # Asset bundle config
+├── notebooks/            # For exploratory work & polished pipelines
+│   ├── dev/              # Analysts' playground
+│   ├── pipelines/        # Production-ready notebooks
+│   └── utils/            # Shared utility notebooks
+│
+├── src/                  # Core functions and transformations
+│   ├── bronze/
+│   ├── silver/
+│   ├── gold/
+│   └── common/           # Reusable code (UDFs, helpers)
+│
+├── tests/
+│   ├── unit/
+│   ├── integration/
+│   └── data_quality/
+│
+├── configs/
+│   ├── dev.yml
+│   ├── staging.yml
+│   └── prod.yml
+│
+├── pipelines/            # Declarative pipeline definitions
+│   ├── bronze_pipeline.py
+│   ├── silver_pipeline.py
+│   └── gold_pipeline.py
+│
+├── requirements.txt      # Python dependencies
+├── environment.yml       # Conda environment for analysts
+└── scripts/              # Utility scripts (deploy, tests)
+
+# Notes on Structure
+
+
+| 1st Level        | 2nd Level          | Notes |
+|------------------|--------------------|-------|
+| README.md        | —                  |       |
+| databricks.yml   | —                  |       |
+| notebooks        | dev                |       |
+| notebooks        | pipelines          |       |
+| notebooks        | utils              |       |
+| src              | bronze             |       |
+| src              | silver             |       |
+| src              | gold               |       |
+| src              | common             |       |
+| tests            | unit               |       |
+| tests            | integration        |       |
+| tests            | data_quality       |       |
+| configs          | dev.yml            |       |
+| configs          | staging.yml        |       |
+| configs          | prod.yml           |       |
+| pipelines        | bronze_pipeline.py |       |
+| pipelines        | silver_pipeline.py |       |
+| pipelines        | gold_pipeline.py   |       |
+| requirements.txt | —                  |       |
+| environment.yml  | —                  |       |
+| scripts          | —                  |       |
+
+
+# Template README
+
+# Workflow_POC
+
+The 'Workflow_POC' project was generated by using the default template.
+
+* `src/`: Python source code for this project.
+* `resources/`: Resource configurations (jobs, pipelines, etc.)
+* `tests/`: Unit tests for the shared Python code.
+* `fixtures/`: Fixtures for data sets (primarily used for testing); see the `load_fixture` sketch below.
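The `fixtures/` README further down shows tests calling a `load_fixture` fixture, but no implementation ships with the template. A minimal sketch of what such a helper in `tests/conftest.py` could look like is given here; the fixtures directory location and the JSON/CSV handling are assumptions, not something the bundle defines:

```python
# tests/conftest.py -- sketch of the load_fixture helper the fixtures/ README assumes
import csv
import json
from pathlib import Path

import pytest

# Assumes fixtures/ sits at the repository root, next to tests/
FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"


@pytest.fixture
def load_fixture():
    """Return a loader that reads a JSON or CSV file from fixtures/."""

    def _load(filename: str):
        path = FIXTURES_DIR / filename
        if path.suffix == ".json":
            return json.loads(path.read_text())
        if path.suffix == ".csv":
            with path.open(newline="") as handle:
                return list(csv.DictReader(handle))
        raise ValueError(f"Unsupported fixture type: {path.suffix}")

    return _load
```

With something like this in place, the `test_using_fixture` example shown in `fixtures/.gitkeep` would resolve `my_data.json` relative to the repository's `fixtures/` folder.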
+ + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/dev-tools/vscode-ext.html. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +If you're developing with an IDE, dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. + + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + +3. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` + diff --git a/databricks.yml b/databricks.yml new file mode 100644 index 0000000..0134cca --- /dev/null +++ b/databricks.yml @@ -0,0 +1,172 @@ + +######################################## +#### POC NOTES DELETE LATER ############ +######################################## +## Would have prefered use databrickscfg to hold host and sp names using custom values +## it can set profiles for built in values but not custom ones it seems +## # qqqq secrets C:\Users\NM2.W9215KB2\.databrickscfg will need for git too +## # seems doesnt do cutom ones want to do +## # [staging] +## #staging_host = -> in databricks yml host: ${workspace.staging_host} +## #staging_env_sp_name = +## #staging_storage_name = +####################################### +#### Differences expect for final varsion +###################################### +## storage account per environment +## different host for different databircks work spaces +## will load wheels via artifacts when we have them +## permission groups needed for admin of staging prod etc +####################################### +#### databrickscfg example with custom values didnt work we will only set the token in this file i think +######################################### +# [dev] +# host = https://adb-295718430158257.17.azuredatabricks.net +# token = dapi************************ +# dev_env_sp_name = b0326488-f6b0-4c0f-bf20-a091224bad83 +# dev_storage_name = unifiedrptdeltalake +# test_var = foundatestvar +######################################## +## Catalogs need to be unique because of the databricks metastore +## We will set the catalogs to only be accessible from their corresponding workspace +## that workspace will be linked to it own storage +## dev_catalogs will have unique schema names per user prefixed with their username so that there is seperation for experimentation +## staging_catalog and prod_catalog will have shared schema names +## schema names will not be defined here, a recommended pattern is to 
have them mirror layer names bronze, silver ... transformations, followed by their "domain" (may not be the right word) e.g. ods +## personal dabs should use personal auth but allow dev_user group view and dev_env_sp to manage it +## prod staging dab permissions should be via gitactions on pull request using service priniciple +## in future we should review permission on staging and prod as should be least privilege and done by service principles unless there is an issue +## we should have no user name based permissions +####################################### +## ref [Evanaze example repo](https://github.com/evanaze/dbx-asset-bundle-deployment) +######################################## +#### END POC NOTES DELETE LATER ########## +######################################## + +###################################### +## Databricks Asset Bundle Definition +###################################### +## ... + +###################################### + +bundle: + name: DatabricksPOC + uuid: ba682f8a-5d14-4ae7-a770-d6a359c0e835 + +# Importing resources. These are split into their own files for modularity +# qqqq comback may be a key place for dry +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +# Then made available to python scriplts via spark.conf.get("bundle.") once exposed by the yml running that file +variables: + catalog: + description: Envirnoment specific catalog name + # schema: + # default: qqqq-deleted-later + # description: Actually setting this with vars but included due to an error + schema_prefix: + default: "" # to be used in dev only so staging prod have no prefix value + description: To prefix dev user schema names to allow multiple dev users to use same catalog with their own different schema names for development + env_name: + description: The environment name (dev, staging, prod) + storage_account: + description: Seperate databricks workspaces dedicated storage (dev, staging, prod) + # Storage principles id is used for name field in databricks.yml name the sp variable: _id + dev_env_sp_id: + default: "b0326488-f6b0-4c0f-bf20-a091224bad83" + staging_env_sp_id: + default: "my-sp-id-jfsdkjhfjsdhfkjh" + prod_env_sp_id: + default: "my-sp-id-jfsdkjhfjsdhfkjh" + +# qqqq will want later if many python files +# artifacts: + # python_artifact: + # type: whl + # build: uv build --wheel + +# Deployment environments +targets: + dev: + # qqqq what is the process of developing and deploying dabs for own area. + # qqqq would it actually be staging with features against folders !!!!??? + + mode: development + # Deploy under own user not service principle + default: true + # This is the the default deployment workspace + workspace: + # profile means in gets set values from [dev] section of databrickscfg + profile: dev + # specify hosts in the databricks.yml not config for clarity + # dev databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + # Dev root path under each users home directory + root_path: /Users/${workspace.current_user.userName}/.bundle/${bundle.name}/dev + variables: + env_name: dev + catalog: dev_catalog + # only dev has schema prefix as its per user + #e.g. phil_ will become phil_bronze_ods if we do layer and domain? 
as our schema naming convention + schema_prefix: ${workspace.current_user.short_name}_ + # dev storage account + storage_account: unifiedrptdeltalake + permissions: + - level: CAN_MANAGE + service_principal_name: "${var.dev_env_sp_id}" + - level: CAN_MANAGE + # Devs manage our own stuff + user_name: ${workspace.current_user.userName} + - level: CAN_VIEW + # Devs can see each others stuff + group_name: dev_env_users + + + + staging: + # Staging should purely be for investigation and testing prior to going to prod it runs in production mode so will run and constant updates and should be through github actions and service principle on successful pull request + mode: production + workspace: + profile: staging + # staging databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + root_path: /Workspace/.bundle/${bundle.name}/staging + variables: + env_name: staging + catalog: staging_catalog + # Staging storage account + storage_account: unifiedrptdeltalake + permissions: + - user_name: philip.tate@nhs.net + level: CAN_MANAGE + - group_name: staging_env_users + level: CAN_VIEW + - service_principal_name: "${var.staging_env_sp_id}" + level: CAN_MANAGE + + + prod: + # Automatically deployed to via git actions and service principle minimal testing on deploy as previously run on staging + mode: production + workspace: + profile: prod + # prod databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + root_path: /Workspace/.bundle/${bundle.name}/prod + variables: + # when 3 databricks it will be a share catalog name across the databricks + env_name: prod + catalog: prod_catalog + # Prod storage account + storage_account: unifiedrptdeltalake + permissions: + - user_name: philip.tate@nhs.net + level: CAN_MANAGE + - group_name: prod_env_users + level: CAN_VIEW + - service_principal_name: "${var.prod_env_sp_id}" + level: CAN_MANAGE diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/deployment_guide.md b/docs/deployment_guide.md new file mode 100644 index 0000000..e69de29 diff --git a/environment-redundant use toml.yml b/environment-redundant use toml.yml new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/.gitkeep b/fixtures/.gitkeep new file mode 100644 index 0000000..6cabba3 --- /dev/null +++ b/fixtures/.gitkeep @@ -0,0 +1,9 @@ +# Test fixtures directory + +Add JSON or CSV files here. 
In tests, use them with `load_fixture()`: + +``` +def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert len(data) >= 1 +``` diff --git a/notebooks/.gitinclude b/notebooks/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1707b8a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,32 @@ +[project] +name = "Workflow_POC" +version = "0.0.1" +authors = [{ name = "philip.tate@nhs.net" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of your pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-dlt", + "databricks-connect>=15.4,<15.5", +] + +[project.scripts] +main = "Workflow_POC.main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.black] +line-length = 125 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/resources/configs/.gitinclude b/resources/configs/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/resources/configs/dev.yml b/resources/configs/dev.yml new file mode 100644 index 0000000..fd3aaa4 --- /dev/null +++ b/resources/configs/dev.yml @@ -0,0 +1,13 @@ +# TODO and gitignore it and only need the one relevant to my databricks workspace +# something like + +# vars: + # # Unity Catalog + # catalog: dev_catalog # temporary dev catalog name + # schema: unified_reporting_dev # schema for bronze/silver/gold + + # # Root path for data (can be DBFS or ADLS mount) + # data_root: /mnt/dev/unified_reporting + + # # Mode flag for your code + # mode: development \ No newline at end of file diff --git a/resources/configs/prod.yml b/resources/configs/prod.yml new file mode 100644 index 0000000..e69de29 diff --git a/resources/configs/staging.yml b/resources/configs/staging.yml new file mode 100644 index 0000000..e69de29 diff --git a/resources/jobs/create_db_connections.dontneedityml b/resources/jobs/create_db_connections.dontneedityml new file mode 100644 index 0000000..8832c4e --- /dev/null +++ b/resources/jobs/create_db_connections.dontneedityml @@ -0,0 +1,14 @@ +resources: + jobs: + CreateDatabaseConnections_job: + name: CreateDatabaseConnections_job + description: "Sets up database connections and foreign catalogs required for pipelines" + email_notifications: + on_failure: + - philip.tate@nhs.net + tasks: + - task_key: create_db_connections + notebook_task: + notebook_path: ingestion/CreateDatabaseConnections.ipynb + base_parameters: + environment: ${var.env_name} diff --git a/resources/pipeline/.gitinclude b/resources/pipeline/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/resources/pipeline/ods_ingestion.yml b/resources/pipeline/ods_ingestion.yml new file mode 100644 index 0000000..2685775 --- /dev/null +++ b/resources/pipeline/ods_ingestion.yml @@ -0,0 +1,86 @@ +############################### +## POC notes - DELETE LATER +############################### +## We should think about these resource files I think potentially a .yml per layer bronze.yml may make sense +## We will not define schemas here +## We use this file to expose from databricks.yml the variables we need 
to set up the pipeline +## We will define too variables just for the set of pipelines here too if we start running layer based .ymls then we would have layer level variables here +############################### +## If we want specific pipeline resource file per .py file we should use this i think + # libraries: + # - notebook: + # path: ../../src/ingestion/ods_ingest.py +## if we want per layer maybe + # libraries: + # - glob: + # # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/**.py +## if we want per domain maybe + # libraries: + # - glob: + # # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/ods_*.py +############################### + +# qqqq discus where want these things to live if it was using a wheel then the python file could be literally a table and a foreach +##### +# If we are running multlipe pipelines we may define all their vars at the top +##### + + +# qqqq +## im thinking var for in script var <-- also no because i cand get bundle.xyz and no all vars seem accessible everywhere i get catalog from databricks.yml +## bundle for vars originating from databricks.ymly +### i get vars from databricks +## pipeline. from pipeline files +## but files run, it shouldnt be bundle and pipeline it should represent the scope they come from + +## qqqq i like the top level config value to pass i do not like construction vars in a yml instead of python but +# Error: cannot create pipeline: The target schema field is required for UC pipelines. Reason: DLT requires specifying a target schema for UC pipelines. Please use the TEMPORARY keyword in the CREATE MATERIALIZED VIEW or CREATE STREAMING TABLE statement if you do not wish to publish your dataset.. +# Error: cannot update pipeline: Specified 'schema' field in the pipeline settings is illegal. Reason: Cannot unset 'schema' field once it's defined in the pipeline spec. Please create a new DLT pipeline. For more information about publishing modes, see https://docs.databricks.com/en/dlt/migrate-to-dpm.html. +variables: + layer: + default: bronze + description: bronze, silver, transfrormations etc + + +x-bronze-config: &bronze-config + bundle.env_name: ${var.env_name} + bundle.storage_account: ${var.storage_account} #storage is environment specific so defined in databricks.yml + pipeline.layer: ${var.layer} # if we are doing layer based resource files qqqq get from var + # f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/ -> py adds {folder_name}/" + pipeline.storage_container_path: "abfss://${var.layer}@${var.storage_account}.dfs.core.windows.net/" + +resources: + pipelines: + pipeline_ods_ingestion: + name: ods_ingestion + libraries: + - glob: + # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/ - might work + # include: ../../src/ingestion/*.py - doesnt work + include: ../../src/ingestion/ods_ingest.py + photon: true + # qqqq good practice to specify its something to do with dlt having beta version? 
+ channel: current + # By defining catalog here we set it for all jobs in the pipeline without needing to specify it witht he variable when defining a table + catalog: ${var.catalog} + target: ${var.schema_prefix}${var.layer}_ods ## AI said missing this qqqq i dont want this hard coded here + serverless: true + # qqqq dont think i need this here DELETE root_path: ../../src/ingestion + # qqqq config is only at pipeline level use yml anchor points if need to reuse + configuration: + ################ Map Databricks Bundle variables to Spark Config Properties ################ + # Map the Bundle variables (from databricks.yml) to Spark config properties + # The key names here MUST match what you use in spark.conf.get() in Python! + # bundle.env_name: ${var.env_name} + # bundle.schema_prefix: ${var.schema_prefix} - qqqq setting schema now in the yml + # bundle.storage_account: ${var.storage_account} + ############### Resource yml files for set of pipelines ################# + # If we do bronze, silver ... tranformation based layers with own yml files will define layer level vars here + # for example this would be + # bundle.layer_name: bronze -> #schema_layer = "bronze_" -> # schema_layer = park.conf.get("bundle.layer_name") + # configuration: + <<: *bronze-config #config anchor point for bronze layer so all pipelines in this file will have this set of configs + pipeline.domain: ods # if we then want to apply per pipeline variable here \ No newline at end of file diff --git a/resources/setup/unity-catalog-try-later.yml b/resources/setup/unity-catalog-try-later.yml new file mode 100644 index 0000000..5737fd5 --- /dev/null +++ b/resources/setup/unity-catalog-try-later.yml @@ -0,0 +1,46 @@ + +# rather than recording manual run scripts in the the setup top level folder we could check and run like this potentially +# this quick was just a suggested improvement and has not been checked just put here for later +# qqqq + + +# # resources/unity-catalog.yml +# resources: +# # --- 1. CATALOGS --- +# unity_catalog_catalogs: +# # Defining the catalog for the DEV environment +# our_catalog: +# name: ${var.catalog} # Will resolve to 'our_catalog' in dev target +# # You can optionally define the managed location here if you want to hardcode it, +# # but often this is best managed by the workspace admin. +# # managed_location: 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/our_catalog' +# comment: 'Dev environment - personal schemas per developer' + +# # The staging and prod targets will use their respective catalog names (staging_catalog/prod_catalog) +# # They can reference the same general structure here, or be defined separately if they have different configs. +# staging_catalog: +# name: staging_catalog +# comment: 'Staging environment - integration testing' + +# prod_catalog: +# name: prod_catalog +# comment: 'Production environment - live data' + +# # --- 2. SCHEMAS (for Staging and Prod) --- +# unity_catalog_schemas: +# staging_schema: +# # Use explicit names for Staging/Prod as they are shared/static +# schema: staging_catalog.our_schema +# catalog: staging_catalog +# comment: 'Shared schema for Staging data' + +# prod_schema: +# schema: prod_catalog.our_schema +# catalog: prod_catalog +# comment: 'Shared schema for Production data' + +# # NOTE on DEV Schema: +# # You DO NOT need to explicitly define the DEV schema here (e.g., our_catalog.${workspace.current_user.short_name}). 
+# # DLT pipelines, when configured with `schema: ${var.schema}` and `catalog: ${var.catalog}`, +# # will automatically create the *schema* (database) if it doesn't exist, as long as the +# # user/service principal has the necessary CREATE SCHEMA permission on the target catalog. \ No newline at end of file diff --git a/scripts/.gitinclude b/scripts/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/setup/Readme.md b/setup/Readme.md new file mode 100644 index 0000000..b62226e --- /dev/null +++ b/setup/Readme.md @@ -0,0 +1 @@ +Manually run files, but kept here for a history or for future setup \ No newline at end of file diff --git a/setup/setup_catalogs.sql b/setup/setup_catalogs.sql new file mode 100644 index 0000000..d44795d --- /dev/null +++ b/setup/setup_catalogs.sql @@ -0,0 +1,40 @@ +-- ============================================================================ +-- ONE-TIME SETUP: Create catalogs and schemas +-- ============================================================================ +-- Run this in Databricks SQL Editor when setting up a new environment +-- +-- POC Setup (current): +-- - Creates 3 separate catalogs in single Databricks instance +-- - our_catalog: Dev work with personal schemas per developer +-- - staging_catalog: Staging with shared schema +-- - prod_catalog: Production with shared schema +-- +-- Production Setup (future with 3 Databricks instances): +-- - Create same catalog name on each instance +-- - Separation by different hosts, not catalog names +-- ============================================================================ + + +-- qqqq Careful this applied it to all databricks +-- there is this +-- ISOLATION MODE ISOLATED -- this line is the magic + -- COMMENT 'POC-only – never visible outside this workspace'; + +-- Create catalogs using managed storage (Databricks handles storage automatically) +CREATE CATALOG IF NOT EXISTS our_catalog +MANAGED LOCATION 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/our_catalog' +COMMENT 'Dev environment - personal schemas per developer'; + +CREATE CATALOG IF NOT EXISTS staging_catalog +MANAGED LOCATION 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/staging_catalog' +COMMENT 'Staging environment - integration testing'; + +CREATE CATALOG IF NOT EXISTS prod_catalog +MANAGED LOCATION 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/prod_catalog' +COMMENT 'Production environment - live data'; + +-- Create shared schemas for staging and prod +-- Dev schemas are created automatically per-user by the bundle deployment +-- (via schema: ${workspace.current_user.short_name} in databricks.yml) +CREATE SCHEMA IF NOT EXISTS staging_catalog.our_schema; +CREATE SCHEMA IF NOT EXISTS prod_catalog.our_schema; \ No newline at end of file diff --git a/src/.gitinclude b/src/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/bronze/- Copy.gitinclude b/src/bronze/- Copy.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/dlt_pipeline.py b/src/dlt_pipeline.py new file mode 100644 index 0000000..93802c1 --- /dev/null +++ b/src/dlt_pipeline.py @@ -0,0 +1 @@ +# entry point DLT pipelines \ No newline at end of file diff --git a/src/gold/- Copy.gitinclude b/src/gold/- Copy.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/ingestion/CreateDatabaseConnections.dontneedipynb b/src/ingestion/CreateDatabaseConnections.dontneedipynb new file mode 
100644 index 0000000..d9142a7 --- /dev/null +++ b/src/ingestion/CreateDatabaseConnections.dontneedipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a98c9296-e636-4ff7-a2ea-3e49feacf9c2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "moodlelearninghub" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d76cd8c9-2608-4449-b0d0-3450fcc2910b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sql\n", + "set scope = {{:scope}}\n", + "\n", + "-- CREATE CONNECTION IF NOT EXISTS moodlelearninghub\n", + "CREATE CONNECTION IF NOT EXISTS moodlelearninghub\n", + " TYPE SQLSERVER\n", + " OPTIONS (\n", + " host secret(scope, 'LearningHubMoodle.host'),\n", + " port secret(scope, 'LearningHubMoodle.port'),\n", + " user secret(scope, 'LearningHubMoodle.username'),\n", + " password secret(scope, 'LearningHubMoodle.password')\n", + " );\n", + "\n", + "-- CREATE CATALOG moodle_learning_hub_catalogue\n", + "CREATE FOREIGN CATALOG IF NOT EXISTS moodle_learning_hub_catalogue USING CONNECTION moodlelearninghub\n", + " OPTIONS (\n", + " database secret(scope, 'LearningHubMoodle.database')\n", + " );" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7ff9f1ef-8c02-485b-af94-ae74eda44a9b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "learninghub" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6826dd70-4a3a-4c92-9b70-e43dbca12313", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sql\n", + "-- CREATE CONNECTION IF NOT EXISTS learninghub\n", + "CREATE CONNECTION IF NOT EXISTS learninghub\n", + " TYPE SQLSERVER\n", + " OPTIONS (\n", + " host secret(scope, 'LearningHub.host'),\n", + " port secret(scope, 'LearningHub.port'),\n", + " user secret(scope, 'LearningHub.username'),\n", + " password secret(scope, 'LearningHub.password')\n", + " ); \n", + "\n", + "-- CREATE CATALOG learninghub_catalog\n", + "CREATE FOREIGN CATALOG IF NOT EXISTS learninghub_catalog USING CONNECTION learninghub\n", + " OPTIONS (\n", + " database secret(scope, 'LearningHub.database')\n", + " );" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "CreateDatabaseConnections", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/src/ingestion/ods_ingest - original for reference delete later.py b/src/ingestion/ods_ingest - original for reference delete later.py new file mode 100644 index 0000000..c7bf5d1 --- /dev/null +++ b/src/ingestion/ods_ingest - original for reference delete later.py @@ -0,0 +1,145 @@ +# from pyspark import pipelines as dp + +# container = 'bronze' +# 
storage_account = 'unifiedrptdeltalake' +# path_to_file_or_folder = 'ods' + +# @dp.table( +# name="bronze_ods.Additional_Attributes_Details", +# comment="Import raw Additional_Attributes_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Additional_Attributes_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Code_System_Details", +# comment="Import raw Code_System_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Code_System_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Contact_Details", +# comment="Import raw Contact_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Contact_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Manifest_Details", +# comment="Import raw Manifest_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Manifest_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Organisation_Details", +# comment="Import raw Organisation_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Organisation_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.OtherID_Details", +# comment="Import raw OtherID_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/OtherID_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.PrimaryRole_Details", +# comment="Import raw PrimaryRole_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/PrimaryRole_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Relationship_Details", +# comment="Import raw Relationship_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Relationship_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Role_Details", +# comment="Import raw Role_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Role_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Successor_Details", +# comment="Import raw Successor_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") 
+# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Successor_Details.csv" +# ) +# ) \ No newline at end of file diff --git a/src/ingestion/ods_ingest.py b/src/ingestion/ods_ingest.py new file mode 100644 index 0000000..9812f39 --- /dev/null +++ b/src/ingestion/ods_ingest.py @@ -0,0 +1,247 @@ +# for making spark tables +# qqqq problem i am having is that we are setting the schema, and dev has schema set as user names +# i want to use the databricks.yml schema name for dev, and for staging and prod i want to set it to bronze_ods in this script +from pyspark import pipelines as dp + + +# Fixed System Constants these and some of this stuff should be going in a helper i think +#ADLS_PROTOCOL = "abfss://" +#ADLS_SUFFIX = ".dfs.core.windows.net" + +# 1. Get the Catalog name +# qqqq i dont think i want a default id prefer an error i think +# if set this in pipeline yml we wont need it +#catalog_name = spark.conf.get("bundle.catalog") + +# 2. Get the Schema Prefix (This is what changes between environments) +# In Dev, this will be the username. In Staging/Prod, it will be blank. +#schema_user_prefix = spark.conf.get("bundle.schema_prefix") +# this will often be a medallion layer but in src we also have transformations and ingestion so i think this mirror folders in source would be logical if team agrees qqqq +#schema_layer = "bronze_" +#schema_domain = "ods" #qqqq check what terminiology we want here + +# Construct the final schema name +#schema_name = (schema_user_prefix + schema_layer + schema_domain) +#print(schema_name) +# The container likely should mirror the layer name? +# container_layer ?? qqqq +#container = spark.conf.get("bundle.layer") # layer is bronze silver etc +# This likely should be dev staging prod +# storage_environment ?? qqqq +# wouldnt have default +# storage_account = spark.conf.get("bundle.storage_account") # 'unifiedrptdeltalake' +storage_container_path = spark.conf.get("pipeline.storage_container_path") +# In our storage our folders maybe should be domain based and if we thing this is manageable as hard rule this variable could be called domain_folder or similar qqqq +# domain_folder ?? qqqq +folder_name = spark.conf.get("pipeline.domain") # ods +#folder_location_path = f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/{folder_name}/" +folder_location_path = f"{storage_container_path }/{folder_name}/" +# "abfss://bronze@unifiedrptdeltalake.dfs.core.windows.net/ods +print(folder_location_path) + + +@dp.table( + # qqqq was f"{schema_name}.Additional_Attributes_Details" but worked before now need to do it this way???!!! 
+ name="Additional_Attributes_Details", + comment="Import raw Additional_Attributes_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Additional_Attributes_Details.csv" + ) + ) + +@dp.table( + name="Code_System_Details", + comment="Import raw Code_System_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Code_System_Details.csv" + ) + ) + +@dp.table( + name="Contact_Details", + comment="Import raw Contact_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Contact_Details.csv" + ) + ) + +@dp.table( + name="Manifest_Details", + comment="Import raw Manifest_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Manifest_Details.csv" + ) + ) + +@dp.table( + name="Organisation_Details", + comment="Import raw Organisation_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Organisation_Details.csv" + ) + ) + +@dp.table( + name="OtherID_Details", + comment="Import raw OtherID_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}OtherID_Details.csv" + ) + ) + +@dp.table( + name="PrimaryRole_Details", + comment="Import raw PrimaryRole_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}PrimaryRole_Details.csv" + ) + ) + +@dp.table( + name="Relationship_Details", + comment="Import raw Relationship_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Relationship_Details.csv" + ) + ) + +@dp.table( + name="Role_Details", + comment="Import raw Role_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Role_Details.csv" + ) + ) + +@dp.table( + name="Successor_Details", + comment="Import raw Successor_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Successor_Details.csv" + ) + ) + + +###### Try this +# import dlt +# from pyspark.sql import SparkSession + +# # ============================================================================ +# # Configuration from Bundle +# # ============================================================================ +# storage_account = spark.conf.get("bundle.storage_account") +# layer = spark.conf.get("bundle.layer") # "bronze" +# domain = spark.conf.get("bundle.domain") # "ods" + +# # ============================================================================ +# # Constants +# # ============================================================================ +# ADLS_PROTOCOL = "abfss://" +# ADLS_SUFFIX = ".dfs.core.windows.net" + +# # ============================================================================ +# # 
Derived Paths +# # ============================================================================ +# container = layer # Container matches layer +# folder_path = f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/{domain}/" + +# # ============================================================================ +# # Data Model - ODS Tables +# # ============================================================================ +# ODS_TABLES = [ +# ("Additional_Attributes_Details", "Import raw Additional_Attributes_Details"), +# ("Code_System_Details", "Import raw Code_System_Details"), +# ("Contact_Details", "Import raw Contact_Details"), +# ("Manifest_Details", "Import raw Manifest_Details"), +# ("Organisation_Details", "Import raw Organisation_Details"), +# ("OtherID_Details", "Import raw OtherID_Details"), +# ("PrimaryRole_Details", "Import raw PrimaryRole_Details"), +# ("Relationship_Details", "Import raw Relationship_Details"), +# ("Role_Details", "Import raw Role_Details"), +# ("Successor_Details", "Import raw Successor_Details"), +# ] + +# # ============================================================================ +# # Helper Functions <----- this is what would go in a wheel probably +# # ============================================================================ +# def load_csv(filename: str): +# """Load CSV from Azure storage with standard options""" +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load(f"{folder_path}{filename}.csv") +# ) + +# # ============================================================================ +# # Create DLT Tables +# # ============================================================================ +# for table_name, comment in ODS_TABLES: +# # Closure to capture loop variables correctly +# def create_table(name=table_name, desc=comment): +# @dlt.table(name=name, comment=desc) +# def table_loader(): +# return load_csv(name) +# return table_loader + +# create_table() \ No newline at end of file diff --git a/src/silver/- Copy.gitinclude b/src/silver/- Copy.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/transformations/.gitinclude b/src/transformations/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/.gitinclude b/src/utils/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/.gitinclude b/tests/.gitinclude new file mode 100644 index 0000000..e69de29 From f66614acc22977714cf04cc217e42830ce4d0ba3 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Mon, 8 Dec 2025 13:04:22 +0000 Subject: [PATCH 02/14] fiddling trying dbx setup --- .github/workflows/dev.yml | 17 +- .github/workflows/staging-cicd.yml | 110 ++++----- databricks.yml | 344 ++++++++++++++--------------- devops/user dev dab deploy.ipynb | 57 +++++ 4 files changed, 300 insertions(+), 228 deletions(-) create mode 100644 devops/user dev dab deploy.ipynb diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index cdb45ea..6a66a04 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -1 +1,16 @@ -# qqqq i dont think we need a dev pipeline event because people are in their own dev workspace on databricks or a dev branch \ No newline at end of file +# qqqq move notes somewhere useful +# qqqq i dont think we need a dev pipeline event because people are in their own dev workspace on databricks or a dev branch +# - but lets give it a go as a place for just data devs to update their stuff together maybe wont need it +# - think repeat any testing want 
to know early anyway?? +# - dont worry about repeating stepts between cicd for now keep simple for branch rules etc do it later if desired +# when are we deploying local dabs though? on push to feature branches???? +# -- linking it to push might be nice deploying dabs could be forgetable step for local development might be nice to auto deploy dab on push?? +name: Deploy to Databricks Dev Shared + +on: + push: + branches: + - dev-data-team-shared + +## Test, DAB, Tell dev repo folder on databricks to pull (so in sync with its dab ... or should it deploy dab but apparently git folders are for reference) +jobs: diff --git a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml index f3e2444..7b0c524 100644 --- a/.github/workflows/staging-cicd.yml +++ b/.github/workflows/staging-cicd.yml @@ -1,56 +1,56 @@ -# Rely on ci.yml and branch rules to ensure bundle validation and linting -# we are not going from staging straight to prod, because we use staging for manual testing as well -name: Deploy to Databricks Staging and Trigger Tests in Databricks - -on: - push: - branches: - - staging - -jobs: - deploy_to_staging: - name: "Deploy Bundle to Stage Environment" - runs-on: ubuntu-latest - # qqqq need to set up my git env values - environment: staging - env: - DATABRICKS_HOST: ${{ vars.DBX_HOST }} - DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} - DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} - - steps: - # qqqq add a step to make a version number so can see its changed and print it as part of the job - #DAB_VERSION: "staging-${GITHUB_SHA::7}" # short commit hash - # databricks bundle deploy --environment staging --version $DAB_VERSION - - - - uses: actions/checkout@v4 - - uses: databricks/setup-cli@main - - name: Deploy bundle - # trigger target the staging deploy in databricks.yml - run: databricks bundle deploy -t staging --auto-approve - working-directory: . 
- - testing: - name: "Integration Testing" - runs-on: ubuntu-latest - needs: deploy_to_staging - environment: staging - env: - DATABRICKS_HOST: ${{ vars.DBX_HOST }} - DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} - DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} - - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - id: cache - name: Cache Python build - with: - python-version: "3.x" - cache-dependency-path: "requirements-dev.txt" - cache: "pip" - - name: Install dependencies - run: pip install -r requirements-dev.txt - - name: Integration Testing +# Rely on ci.yml and branch rules to ensure bundle validation and linting +# we are not going from staging straight to prod, because we use staging for manual testing as well +name: Deploy to Databricks Staging and Trigger Tests in Databricks + +on: + push: + branches: + - staging + +jobs: + deploy_to_staging: + name: "Deploy Bundle to Stage Environment" + runs-on: ubuntu-latest + # qqqq need to set up my git env values + environment: staging + env: + DATABRICKS_HOST: ${{ vars.DBX_HOST }} + DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} + DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} + + steps: + # qqqq add a step to make a version number so can see its changed and print it as part of the job + #DAB_VERSION: "staging-${GITHUB_SHA::7}" # short commit hash + # databricks bundle deploy --environment staging --version $DAB_VERSION + + + - uses: actions/checkout@v4 + - uses: databricks/setup-cli@main + - name: Deploy bundle + # trigger target the staging deploy in databricks.yml + run: databricks bundle deploy -t staging --auto-approve + working-directory: . + + testing: + name: "Integration Testing" + runs-on: ubuntu-latest + needs: deploy_to_staging + environment: staging + env: + DATABRICKS_HOST: ${{ vars.DBX_HOST }} + DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} + DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + id: cache + name: Cache Python build + with: + python-version: "3.x" + cache-dependency-path: "requirements-dev.txt" + cache: "pip" + - name: Install dependencies + run: pip install -r requirements-dev.txt + - name: Integration Testing run: pytest tests/integration \ No newline at end of file diff --git a/databricks.yml b/databricks.yml index 0134cca..450f278 100644 --- a/databricks.yml +++ b/databricks.yml @@ -1,172 +1,172 @@ - -######################################## -#### POC NOTES DELETE LATER ############ -######################################## -## Would have prefered use databrickscfg to hold host and sp names using custom values -## it can set profiles for built in values but not custom ones it seems -## # qqqq secrets C:\Users\NM2.W9215KB2\.databrickscfg will need for git too -## # seems doesnt do cutom ones want to do -## # [staging] -## #staging_host = -> in databricks yml host: ${workspace.staging_host} -## #staging_env_sp_name = -## #staging_storage_name = -####################################### -#### Differences expect for final varsion -###################################### -## storage account per environment -## different host for different databircks work spaces -## will load wheels via artifacts when we have them -## permission groups needed for admin of staging prod etc -####################################### -#### databrickscfg example with custom values didnt work we will only set the token in this file i think -######################################### -# [dev] -# host = 
https://adb-295718430158257.17.azuredatabricks.net -# token = dapi************************ -# dev_env_sp_name = b0326488-f6b0-4c0f-bf20-a091224bad83 -# dev_storage_name = unifiedrptdeltalake -# test_var = foundatestvar -######################################## -## Catalogs need to be unique because of the databricks metastore -## We will set the catalogs to only be accessible from their corresponding workspace -## that workspace will be linked to it own storage -## dev_catalogs will have unique schema names per user prefixed with their username so that there is seperation for experimentation -## staging_catalog and prod_catalog will have shared schema names -## schema names will not be defined here, a recommended pattern is to have them mirror layer names bronze, silver ... transformations, followed by their "domain" (may not be the right word) e.g. ods -## personal dabs should use personal auth but allow dev_user group view and dev_env_sp to manage it -## prod staging dab permissions should be via gitactions on pull request using service priniciple -## in future we should review permission on staging and prod as should be least privilege and done by service principles unless there is an issue -## we should have no user name based permissions -####################################### -## ref [Evanaze example repo](https://github.com/evanaze/dbx-asset-bundle-deployment) -######################################## -#### END POC NOTES DELETE LATER ########## -######################################## - -###################################### -## Databricks Asset Bundle Definition -###################################### -## ... - -###################################### - -bundle: - name: DatabricksPOC - uuid: ba682f8a-5d14-4ae7-a770-d6a359c0e835 - -# Importing resources. These are split into their own files for modularity -# qqqq comback may be a key place for dry -include: - - resources/*.yml - - resources/*/*.yml - -# Variable declarations. These variables are assigned in the dev/prod targets below. -# Then made available to python scriplts via spark.conf.get("bundle.") once exposed by the yml running that file -variables: - catalog: - description: Envirnoment specific catalog name - # schema: - # default: qqqq-deleted-later - # description: Actually setting this with vars but included due to an error - schema_prefix: - default: "" # to be used in dev only so staging prod have no prefix value - description: To prefix dev user schema names to allow multiple dev users to use same catalog with their own different schema names for development - env_name: - description: The environment name (dev, staging, prod) - storage_account: - description: Seperate databricks workspaces dedicated storage (dev, staging, prod) - # Storage principles id is used for name field in databricks.yml name the sp variable: _id - dev_env_sp_id: - default: "b0326488-f6b0-4c0f-bf20-a091224bad83" - staging_env_sp_id: - default: "my-sp-id-jfsdkjhfjsdhfkjh" - prod_env_sp_id: - default: "my-sp-id-jfsdkjhfjsdhfkjh" - -# qqqq will want later if many python files -# artifacts: - # python_artifact: - # type: whl - # build: uv build --wheel - -# Deployment environments -targets: - dev: - # qqqq what is the process of developing and deploying dabs for own area. - # qqqq would it actually be staging with features against folders !!!!??? 
- - mode: development - # Deploy under own user not service principle - default: true - # This is the the default deployment workspace - workspace: - # profile means in gets set values from [dev] section of databrickscfg - profile: dev - # specify hosts in the databricks.yml not config for clarity - # dev databricks host - host: https://adb-295718430158257.17.azuredatabricks.net - # Dev root path under each users home directory - root_path: /Users/${workspace.current_user.userName}/.bundle/${bundle.name}/dev - variables: - env_name: dev - catalog: dev_catalog - # only dev has schema prefix as its per user - #e.g. phil_ will become phil_bronze_ods if we do layer and domain? as our schema naming convention - schema_prefix: ${workspace.current_user.short_name}_ - # dev storage account - storage_account: unifiedrptdeltalake - permissions: - - level: CAN_MANAGE - service_principal_name: "${var.dev_env_sp_id}" - - level: CAN_MANAGE - # Devs manage our own stuff - user_name: ${workspace.current_user.userName} - - level: CAN_VIEW - # Devs can see each others stuff - group_name: dev_env_users - - - - staging: - # Staging should purely be for investigation and testing prior to going to prod it runs in production mode so will run and constant updates and should be through github actions and service principle on successful pull request - mode: production - workspace: - profile: staging - # staging databricks host - host: https://adb-295718430158257.17.azuredatabricks.net - root_path: /Workspace/.bundle/${bundle.name}/staging - variables: - env_name: staging - catalog: staging_catalog - # Staging storage account - storage_account: unifiedrptdeltalake - permissions: - - user_name: philip.tate@nhs.net - level: CAN_MANAGE - - group_name: staging_env_users - level: CAN_VIEW - - service_principal_name: "${var.staging_env_sp_id}" - level: CAN_MANAGE - - - prod: - # Automatically deployed to via git actions and service principle minimal testing on deploy as previously run on staging - mode: production - workspace: - profile: prod - # prod databricks host - host: https://adb-295718430158257.17.azuredatabricks.net - root_path: /Workspace/.bundle/${bundle.name}/prod - variables: - # when 3 databricks it will be a share catalog name across the databricks - env_name: prod - catalog: prod_catalog - # Prod storage account - storage_account: unifiedrptdeltalake - permissions: - - user_name: philip.tate@nhs.net - level: CAN_MANAGE - - group_name: prod_env_users - level: CAN_VIEW - - service_principal_name: "${var.prod_env_sp_id}" - level: CAN_MANAGE + +######################################## +#### POC NOTES DELETE LATER ############ +######################################## +## Would have prefered use databrickscfg to hold host and sp names using custom values +## it can set profiles for built in values but not custom ones it seems +## # qqqq secrets C:\Users\NM2.W9215KB2\.databrickscfg will need for git too +## # seems doesnt do cutom ones want to do +## # [staging] +## #staging_host = -> in databricks yml host: ${workspace.staging_host} +## #staging_env_sp_name = +## #staging_storage_name = +####################################### +#### Differences expect for final varsion +###################################### +## storage account per environment +## different host for different databircks work spaces +## will load wheels via artifacts when we have them +## permission groups needed for admin of staging prod etc +####################################### +#### databrickscfg example with custom values didnt work we 
will only set the token in this file i think +######################################### +# [dev] +# host = https://adb-295718430158257.17.azuredatabricks.net +# token = dapi************************ +# dev_env_sp_name = b0326488-f6b0-4c0f-bf20-a091224bad83 +# dev_storage_name = unifiedrptdeltalake +# test_var = foundatestvar +######################################## +## Catalogs need to be unique because of the databricks metastore +## We will set the catalogs to only be accessible from their corresponding workspace +## that workspace will be linked to it own storage +## dev_catalogs will have unique schema names per user prefixed with their username so that there is seperation for experimentation +## staging_catalog and prod_catalog will have shared schema names +## schema names will not be defined here, a recommended pattern is to have them mirror layer names bronze, silver ... transformations, followed by their "domain" (may not be the right word) e.g. ods +## personal dabs should use personal auth but allow dev_user group view and dev_env_sp to manage it +## prod staging dab permissions should be via gitactions on pull request using service priniciple +## in future we should review permission on staging and prod as should be least privilege and done by service principles unless there is an issue +## we should have no user name based permissions +####################################### +## ref [Evanaze example repo](https://github.com/evanaze/dbx-asset-bundle-deployment) +######################################## +#### END POC NOTES DELETE LATER ########## +######################################## + +###################################### +## Databricks Asset Bundle Definition +###################################### +## ... + +###################################### + +bundle: + name: DatabricksPOC + uuid: ba682f8a-5d14-4ae7-a770-d6a359c0e835 + +# Importing resources. These are split into their own files for modularity +# qqqq comback may be a key place for dry +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +# Then made available to python scriplts via spark.conf.get("bundle.") once exposed by the yml running that file +variables: + catalog: + description: Envirnoment specific catalog name + # schema: + # default: qqqq-deleted-later + # description: Actually setting this with vars but included due to an error + schema_prefix: + default: "" # to be used in dev only so staging prod have no prefix value + description: To prefix dev user schema names to allow multiple dev users to use same catalog with their own different schema names for development + env_name: + description: The environment name (dev, staging, prod) + storage_account: + description: Seperate databricks workspaces dedicated storage (dev, staging, prod) + # Storage principles id is used for name field in databricks.yml name the sp variable: _id + dev_env_sp_id: + default: "b0326488-f6b0-4c0f-bf20-a091224bad83" + staging_env_sp_id: + default: "my-sp-id-jfsdkjhfjsdhfkjh" + prod_env_sp_id: + default: "my-sp-id-jfsdkjhfjsdhfkjh" + +# qqqq will want later if many python files +# artifacts: + # python_artifact: + # type: whl + # build: uv build --wheel + +# Deployment environments +targets: + dev: + # qqqq what is the process of developing and deploying dabs for own area. + # qqqq would it actually be staging with features against folders !!!!??? 
+ + mode: development + # Deploy under own user not service principle + default: true + # This is the the default deployment workspace + workspace: + # profile means in gets set values from [dev] section of databrickscfg + profile: dev + # specify hosts in the databricks.yml not config for clarity + # dev databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + # Dev root path under each users home directory + root_path: /Users/${workspace.current_user.userName}/.bundle/${bundle.name}/dev + variables: + env_name: dev + catalog: dev_catalog + # only dev has schema prefix as its per user + #e.g. phil_ will become phil_bronze_ods if we do layer and domain? as our schema naming convention + schema_prefix: ${workspace.current_user.short_name}_ + # dev storage account + storage_account: unifiedrptdeltalake + permissions: + - level: CAN_MANAGE + service_principal_name: "${var.dev_env_sp_id}" + - level: CAN_MANAGE + # Devs manage our own stuff + user_name: ${workspace.current_user.userName} + - level: CAN_VIEW + # Devs can see each others stuff + group_name: dev_env_users + + + + staging: + # Staging should purely be for investigation and testing prior to going to prod it runs in production mode so will run and constant updates and should be through github actions and service principle on successful pull request + mode: production + workspace: + profile: staging + # staging databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + root_path: /Workspace/.bundle/${bundle.name}/staging + variables: + env_name: staging + catalog: staging_catalog + # Staging storage account + storage_account: unifiedrptdeltalake + permissions: + - user_name: philip.tate@nhs.net + level: CAN_MANAGE + - group_name: staging_env_users + level: CAN_VIEW + - service_principal_name: "${var.staging_env_sp_id}" + level: CAN_MANAGE + + + prod: + # Automatically deployed to via git actions and service principle minimal testing on deploy as previously run on staging + mode: production + workspace: + profile: prod + # prod databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + root_path: /Workspace/.bundle/${bundle.name}/prod + variables: + # when 3 databricks it will be a share catalog name across the databricks + env_name: prod + catalog: prod_catalog + # Prod storage account + storage_account: unifiedrptdeltalake + permissions: + - user_name: philip.tate@nhs.net + level: CAN_MANAGE + - group_name: prod_env_users + level: CAN_VIEW + - service_principal_name: "${var.prod_env_sp_id}" + level: CAN_MANAGE diff --git a/devops/user dev dab deploy.ipynb b/devops/user dev dab deploy.ipynb new file mode 100644 index 0000000..f9229c8 --- /dev/null +++ b/devops/user dev dab deploy.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dd35ddc1-acdf-428d-8fd1-0abaf308ceb0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "\n", + "print(\"\\nStep 2: Validate DAB\")\n", + "subprocess.run([\"databricks\", \"bundle\", \"validate\"])\n" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", 
+ "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": -1, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 4 + }, + "notebookName": "user dev dab deploy", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 3790c8faae24ef167cd72bb3b8587e892145cab6 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Mon, 8 Dec 2025 14:22:24 +0000 Subject: [PATCH 03/14] exploring cicd secrets --- .github/workflows/staging-cicd.yml | 73 +++++++++++++++++++----------- databricks.yml | 3 +- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml index 7b0c524..cc91c25 100644 --- a/.github/workflows/staging-cicd.yml +++ b/.github/workflows/staging-cicd.yml @@ -1,11 +1,13 @@ # Rely on ci.yml and branch rules to ensure bundle validation and linting # we are not going from staging straight to prod, because we use staging for manual testing as well +# [use as a ref](https://github.com/evanaze/dbx-asset-bundle-deployment/blob/main/.github/workflows/dev-ci.yml) name: Deploy to Databricks Staging and Trigger Tests in Databricks on: push: branches: - - staging + - staging #longer name for communication in the poc reduce when using it + - staging-data-test-team-shared jobs: deploy_to_staging: @@ -19,38 +21,55 @@ jobs: DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} steps: + - name: Checkout code + uses: actions/checkout@v4 + + ### maybe put in later + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: '3.10' + + # - name: Install dependencies + # run: | + # pip install -r requirements.txt # qqqq add a step to make a version number so can see its changed and print it as part of the job #DAB_VERSION: "staging-${GITHUB_SHA::7}" # short commit hash # databricks bundle deploy --environment staging --version $DAB_VERSION - - uses: actions/checkout@v4 + - name: Set up Databricks CLI - uses: databricks/setup-cli@main - - name: Deploy bundle + + - name: Validate Staging Bundle # trigger target the staging deploy in databricks.yml - run: databricks bundle deploy -t staging --auto-approve - working-directory: . + run: databricks bundle validate -t staging - testing: - name: "Integration Testing" - runs-on: ubuntu-latest - needs: deploy_to_staging - environment: staging - env: - DATABRICKS_HOST: ${{ vars.DBX_HOST }} - DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} - DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} + - name: Deploy Staging Bundle + # trigger target the staging deploy in databricks.yml + run: databricks bundle deploy -t staging #??--auto-approve + # working-directory: . 
- steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - id: cache - name: Cache Python build - with: - python-version: "3.x" - cache-dependency-path: "requirements-dev.txt" - cache: "pip" - - name: Install dependencies - run: pip install -r requirements-dev.txt - - name: Integration Testing - run: pytest tests/integration \ No newline at end of file + # testing: + # name: "Integration Testing" + # runs-on: ubuntu-latest + # needs: deploy_to_staging + # environment: staging + # env: + # DATABRICKS_HOST: ${{ vars.DBX_HOST }} + # DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }} + # DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }} + + # steps: + # - uses: actions/checkout@v4 + # - uses: actions/setup-python@v5 + # id: cache + # name: Cache Python build + # with: + # python-version: "3.x" + # cache-dependency-path: "requirements-dev.txt" + # cache: "pip" + # - name: Install dependencies + # run: pip install -r requirements-dev.txt + # - name: Integration Testing + # run: pytest tests/integration \ No newline at end of file diff --git a/databricks.yml b/databricks.yml index 450f278..b9a2ce5 100644 --- a/databricks.yml +++ b/databricks.yml @@ -76,10 +76,11 @@ variables: storage_account: description: Seperate databricks workspaces dedicated storage (dev, staging, prod) # Storage principles id is used for name field in databricks.yml name the sp variable: _id + # be nice to hide these in the cfg, github has these in vars anyway not seen this done tried cfg file and not tried a config.yml [example proj with {{ dev_sp }}](https://github.com/evanaze/dbx-asset-bundle-deployment/blob/main/databricks.yml) ... not seeing databricks bundle deploy -t staging --var dev_sp= being specified in the cicd dev_env_sp_id: default: "b0326488-f6b0-4c0f-bf20-a091224bad83" staging_env_sp_id: - default: "my-sp-id-jfsdkjhfjsdhfkjh" + default: "d588f2c8-0c0a-4ded-9da2-0663bf8dd994" prod_env_sp_id: default: "my-sp-id-jfsdkjhfjsdhfkjh" From c137a20e3da6de682447988aec7aa8e7ef3b5821 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Mon, 8 Dec 2025 14:26:16 +0000 Subject: [PATCH 04/14] pushing yml --- .github/workflows/staging-cicd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml index cc91c25..bd065d5 100644 --- a/.github/workflows/staging-cicd.yml +++ b/.github/workflows/staging-cicd.yml @@ -39,7 +39,7 @@ jobs: - name: Set up Databricks CLI - - uses: databricks/setup-cli@main + uses: databricks/setup-cli@main - name: Validate Staging Bundle # trigger target the staging deploy in databricks.yml From 6e090cb04cf4036829e755dbff5c35eeb1c37ef5 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Mon, 8 Dec 2025 14:31:43 +0000 Subject: [PATCH 05/14] git action cant find yml --- .github/workflows/staging-cicd.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml index bd065d5..4848a3a 100644 --- a/.github/workflows/staging-cicd.yml +++ b/.github/workflows/staging-cicd.yml @@ -13,7 +13,6 @@ jobs: deploy_to_staging: name: "Deploy Bundle to Stage Environment" runs-on: ubuntu-latest - # qqqq need to set up my git env values environment: staging env: DATABRICKS_HOST: ${{ vars.DBX_HOST }} From 91431d2ecb84f237f02176b2f45be3e167aee396 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Mon, 8 Dec 2025 14:45:26 +0000 Subject: [PATCH 06/14] staging profile nullifying --- .github/workflows/staging-cicd.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml index 4848a3a..ab26a26 100644 --- a/.github/workflows/staging-cicd.yml +++ b/.github/workflows/staging-cicd.yml @@ -42,11 +42,11 @@ jobs: - name: Validate Staging Bundle # trigger target the staging deploy in databricks.yml - run: databricks bundle validate -t staging + run: databricks bundle validate -t staging --profile "" # forces use of env vars only - name: Deploy Staging Bundle # trigger target the staging deploy in databricks.yml - run: databricks bundle deploy -t staging #??--auto-approve + run: databricks bundle deploy -t staging --profile "" # forces use of env vars only #??--auto-approve # working-directory: . # testing: From 25f8ff59519056db2f93e85f789046eb73017d6b Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Mon, 8 Dec 2025 15:03:00 +0000 Subject: [PATCH 07/14] git actions should look for cfg file --- .github/workflows/staging-cicd.yml | 4 ++-- databricks.yml | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml index ab26a26..54555dc 100644 --- a/.github/workflows/staging-cicd.yml +++ b/.github/workflows/staging-cicd.yml @@ -42,11 +42,11 @@ jobs: - name: Validate Staging Bundle # trigger target the staging deploy in databricks.yml - run: databricks bundle validate -t staging --profile "" # forces use of env vars only + run: databricks bundle validate -t staging #--profile "" # forces use of env vars only # didnt work - name: Deploy Staging Bundle # trigger target the staging deploy in databricks.yml - run: databricks bundle deploy -t staging --profile "" # forces use of env vars only #??--auto-approve + run: databricks bundle deploy -t staging #--profile "" # forces use of env vars only #??--auto-approve # didnt work # working-directory: . 
# testing: diff --git a/databricks.yml b/databricks.yml index b9a2ce5..03ccce6 100644 --- a/databricks.yml +++ b/databricks.yml @@ -101,7 +101,7 @@ targets: default: true # This is the the default deployment workspace workspace: - # profile means in gets set values from [dev] section of databrickscfg + # profile means in gets set values from [dev] section of databrickscfg useful on local machine, unneeded in databricks environment profile: dev # specify hosts in the databricks.yml not config for clarity # dev databricks host @@ -126,13 +126,14 @@ targets: # Devs can see each others stuff group_name: dev_env_users - +#delete this staging: # Staging should purely be for investigation and testing prior to going to prod it runs in production mode so will run and constant updates and should be through github actions and service principle on successful pull request mode: production workspace: - profile: staging + # specify in the databricks validate and deploy in cicd github no profile otherwise will fail to find vars, use profile only for deploying from local machnine + # profile: staging #still failing can make the file on the fly but would prefer not to # staging databricks host host: https://adb-295718430158257.17.azuredatabricks.net root_path: /Workspace/.bundle/${bundle.name}/staging @@ -154,7 +155,8 @@ targets: # Automatically deployed to via git actions and service principle minimal testing on deploy as previously run on staging mode: production workspace: - profile: prod + # specify in the databricks validate and deploy in cicd github no profile otherwise will fail to find vars, use profile only for deploying from local machnine + # profile: prod # prod databricks host host: https://adb-295718430158257.17.azuredatabricks.net root_path: /Workspace/.bundle/${bundle.name}/prod From 2b78dc53d4de89bf1c0a33400c3fbfcb2489d681 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Tue, 9 Dec 2025 09:49:10 +0000 Subject: [PATCH 08/14] stuff --- .github/workflows/ci.yml | 112 +++++++++--------- devops/README.md | 5 + resources/pipeline/ods_ingestion.yml | 170 +++++++++++++-------------- 3 files changed, 146 insertions(+), 141 deletions(-) create mode 100644 devops/README.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 63b7205..fba5fa1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,56 +1,56 @@ -# We want to run CI processes that can run independent of databricks as branch rules so that we dont # deploy at cost code that we already should know needs changing -# such as linting, and unit test for python, maybe dab? 
verify -# we run these on all pull request because if there is a hot fix it may not have passed through -# staging for example -# qqqq check this is up to date -name: CI - Pull Request Checks - -# Run CI on all pull requests -on: - pull_request: - branches: - - '**' # all branches - -jobs: - ci_checks: - name: "Linting, Unit Tests, DAB Verify" - runs-on: ubuntu-latest - - steps: - # Checkout code - - name: Checkout repository - uses: actions/checkout@v4 - - # Set up Python - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - # Install dependencies used for linting and unit tests - - name: Install dependencies - run: pip install -r requirements-dev.txt - - # Run python unit tests - - name: Run Unit Tests - run: pytest tests/unit - - # Run python lint - # qqqq on example used flake8 instead - # pyproject.toml will need configuring - - name: Run Linting - run: pylint src - - # qqqq to do run commit lint step and put in commit lint config - # see TELBlazor - - name: Commit lint - run: | - echo "Commit lint not implemented" - exit 1 - - # qqqq to do run version generation step and put in commit lint config - # see TELBlazor - - name: Version Generation Test Run - run: | - echo "Version test run not implemented" - exit 1 +# We want to run CI processes that can run independent of databricks as branch rules so that we dont # deploy at cost code that we already should know needs changing +# such as linting, and unit test for python, maybe dab? verify +# we run these on all pull request because if there is a hot fix it may not have passed through +# staging for example +# qqqq check this is up to date +name: CI - Pull Request Checks + +# Run CI on all pull requests +on: + pull_request: + branches: + - '**' # all branches + +jobs: + ci_checks: + name: "Linting, Unit Tests, DAB Verify" + runs-on: ubuntu-latest + + steps: + # Checkout code + - name: Checkout repository + uses: actions/checkout@v4 + + # Set up Python + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + # Install dependencies used for linting and unit tests + - name: Install dependencies + run: pip install -r requirements-dev.txt + + # Run python unit tests + - name: Run Unit Tests + run: pytest tests/unit + + # Run python lint + # qqqq on example used flake8 instead + # pyproject.toml will need configuring + - name: Run Linting + run: pylint src + + # qqqq to do run commit lint step and put in commit lint config + # see TELBlazor + - name: Commit lint + run: | + echo "Commit lint not implemented" + exit 1 + + # qqqq to do run version generation step and put in commit lint config + # see TELBlazor + - name: Version Generation Test Run + run: | + echo "Version test run not implemented" + exit 1 diff --git a/devops/README.md b/devops/README.md new file mode 100644 index 0000000..b4ee10e --- /dev/null +++ b/devops/README.md @@ -0,0 +1,5 @@ +# Development Deployment + +It would be nice without the terminal and without needing to push to github to trigger unit tests, bundle validation, and bundle deployment for the local development user areas. + +This doesnt seem do-able with a notebook, and enabling the terminal is an option, so using the databrick.yml ui deploy, and remembering to triggered any unit tests seems like it will be the process for now. 
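For reference, the closest notebook-driven flow is the subprocess pattern the devops deploy notebook above already sketches; a minimal sketch of that attempt is below. The test path and the availability of pytest and the databricks CLI inside the notebook environment are assumptions, which is exactly where this falls down without the terminal enabled.

    import subprocess

    # Run unit tests, then validate the bundle, stopping on the first failure.
    # Path is illustrative; ci.yml currently uses tests/unit.
    for cmd in (["pytest", "tests/unit"], ["databricks", "bundle", "validate"]):
        result = subprocess.run(cmd, capture_output=True, text=True)
        print(result.stdout)
        if result.returncode != 0:
            raise RuntimeError(f"{' '.join(cmd)} failed:\n{result.stderr}")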
\ No newline at end of file diff --git a/resources/pipeline/ods_ingestion.yml b/resources/pipeline/ods_ingestion.yml index 2685775..7c91c29 100644 --- a/resources/pipeline/ods_ingestion.yml +++ b/resources/pipeline/ods_ingestion.yml @@ -1,86 +1,86 @@ -############################### -## POC notes - DELETE LATER -############################### -## We should think about these resource files I think potentially a .yml per layer bronze.yml may make sense -## We will not define schemas here -## We use this file to expose from databricks.yml the variables we need to set up the pipeline -## We will define too variables just for the set of pipelines here too if we start running layer based .ymls then we would have layer level variables here -############################### -## If we want specific pipeline resource file per .py file we should use this i think - # libraries: - # - notebook: - # path: ../../src/ingestion/ods_ingest.py -## if we want per layer maybe - # libraries: - # - glob: - # # if doing a pipeline per layer would do something like - # include: ../../src/ingestion/**.py -## if we want per domain maybe - # libraries: - # - glob: - # # if doing a pipeline per layer would do something like - # include: ../../src/ingestion/ods_*.py -############################### - -# qqqq discus where want these things to live if it was using a wheel then the python file could be literally a table and a foreach -##### -# If we are running multlipe pipelines we may define all their vars at the top -##### - - -# qqqq -## im thinking var for in script var <-- also no because i cand get bundle.xyz and no all vars seem accessible everywhere i get catalog from databricks.yml -## bundle for vars originating from databricks.ymly -### i get vars from databricks -## pipeline. from pipeline files -## but files run, it shouldnt be bundle and pipeline it should represent the scope they come from - -## qqqq i like the top level config value to pass i do not like construction vars in a yml instead of python but -# Error: cannot create pipeline: The target schema field is required for UC pipelines. Reason: DLT requires specifying a target schema for UC pipelines. Please use the TEMPORARY keyword in the CREATE MATERIALIZED VIEW or CREATE STREAMING TABLE statement if you do not wish to publish your dataset.. -# Error: cannot update pipeline: Specified 'schema' field in the pipeline settings is illegal. Reason: Cannot unset 'schema' field once it's defined in the pipeline spec. Please create a new DLT pipeline. For more information about publishing modes, see https://docs.databricks.com/en/dlt/migrate-to-dpm.html. 
-variables: - layer: - default: bronze - description: bronze, silver, transfrormations etc - - -x-bronze-config: &bronze-config - bundle.env_name: ${var.env_name} - bundle.storage_account: ${var.storage_account} #storage is environment specific so defined in databricks.yml - pipeline.layer: ${var.layer} # if we are doing layer based resource files qqqq get from var - # f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/ -> py adds {folder_name}/" - pipeline.storage_container_path: "abfss://${var.layer}@${var.storage_account}.dfs.core.windows.net/" - -resources: - pipelines: - pipeline_ods_ingestion: - name: ods_ingestion - libraries: - - glob: - # if doing a pipeline per layer would do something like - # include: ../../src/ingestion/ - might work - # include: ../../src/ingestion/*.py - doesnt work - include: ../../src/ingestion/ods_ingest.py - photon: true - # qqqq good practice to specify its something to do with dlt having beta version? - channel: current - # By defining catalog here we set it for all jobs in the pipeline without needing to specify it witht he variable when defining a table - catalog: ${var.catalog} - target: ${var.schema_prefix}${var.layer}_ods ## AI said missing this qqqq i dont want this hard coded here - serverless: true - # qqqq dont think i need this here DELETE root_path: ../../src/ingestion - # qqqq config is only at pipeline level use yml anchor points if need to reuse - configuration: - ################ Map Databricks Bundle variables to Spark Config Properties ################ - # Map the Bundle variables (from databricks.yml) to Spark config properties - # The key names here MUST match what you use in spark.conf.get() in Python! - # bundle.env_name: ${var.env_name} - # bundle.schema_prefix: ${var.schema_prefix} - qqqq setting schema now in the yml - # bundle.storage_account: ${var.storage_account} - ############### Resource yml files for set of pipelines ################# - # If we do bronze, silver ... 
tranformation based layers with own yml files will define layer level vars here - # for example this would be - # bundle.layer_name: bronze -> #schema_layer = "bronze_" -> # schema_layer = park.conf.get("bundle.layer_name") - # configuration: - <<: *bronze-config #config anchor point for bronze layer so all pipelines in this file will have this set of configs +############################### +## POC notes - DELETE LATER +############################### +## We should think about these resource files I think potentially a .yml per layer bronze.yml may make sense +## We will not define schemas here +## We use this file to expose from databricks.yml the variables we need to set up the pipeline +## We will define too variables just for the set of pipelines here too if we start running layer based .ymls then we would have layer level variables here +############################### +## If we want specific pipeline resource file per .py file we should use this i think + # libraries: + # - notebook: + # path: ../../src/ingestion/ods_ingest.py +## if we want per layer maybe + # libraries: + # - glob: + # # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/**.py +## if we want per domain maybe + # libraries: + # - glob: + # # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/ods_*.py +############################### + +# qqqq discus where want these things to live if it was using a wheel then the python file could be literally a table and a foreach +##### +# If we are running multlipe pipelines we may define all their vars at the top +##### + + +# qqqq +## im thinking var for in script var <-- also no because i cand get bundle.xyz and no all vars seem accessible everywhere i get catalog from databricks.yml +## bundle for vars originating from databricks.ymly +### i get vars from databricks +## pipeline. from pipeline files +## but files run, it shouldnt be bundle and pipeline it should represent the scope they come from + +## qqqq i like the top level config value to pass i do not like construction vars in a yml instead of python but +# Error: cannot create pipeline: The target schema field is required for UC pipelines. Reason: DLT requires specifying a target schema for UC pipelines. Please use the TEMPORARY keyword in the CREATE MATERIALIZED VIEW or CREATE STREAMING TABLE statement if you do not wish to publish your dataset.. +# Error: cannot update pipeline: Specified 'schema' field in the pipeline settings is illegal. Reason: Cannot unset 'schema' field once it's defined in the pipeline spec. Please create a new DLT pipeline. For more information about publishing modes, see https://docs.databricks.com/en/dlt/migrate-to-dpm.html. 
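# For clarity on how the configuration keys set further down in this file are consumed:
# the pipeline's configuration block surfaces them as Spark conf properties, and the
# Python source reads them back under the same key names. A minimal sketch of the
# consuming side is below (the key names match what this file sets; the local variable
# names are illustrative and may differ from ods_ingest.py):
#
#   storage_container_path = spark.conf.get("pipeline.storage_container_path")
#   folder_name = spark.conf.get("pipeline.domain")   # e.g. "ods"
#   env_name = spark.conf.get("bundle.env_name")      # e.g. "dev", "staging", "prod"
#   folder_location_path = f"{storage_container_path}{folder_name}/"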
+variables: + layer: + default: bronze + description: bronze, silver, transfrormations etc + + +x-bronze-config: &bronze-config + bundle.env_name: ${var.env_name} + bundle.storage_account: ${var.storage_account} #storage is environment specific so defined in databricks.yml + pipeline.layer: ${var.layer} # if we are doing layer based resource files qqqq get from var + # f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/ -> py adds {folder_name}/" + pipeline.storage_container_path: "abfss://${var.layer}@${var.storage_account}.dfs.core.windows.net/" + +resources: + pipelines: + pipeline_ods_ingestion: + name: ods_ingestion + libraries: + - glob: + # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/ - might work + # include: ../../src/ingestion/*.py - doesnt work + include: ../../src/ingestion/ods_ingest.py + photon: true + # qqqq good practice to specify its something to do with dlt having beta version? + channel: current + # By defining catalog here we set it for all jobs in the pipeline without needing to specify it witht he variable when defining a table + catalog: ${var.catalog} + target: ${var.schema_prefix}${var.layer}_ods ## AI said missing this qqqq i dont want this hard coded here + serverless: true + # qqqq dont think i need this here DELETE root_path: ../../src/ingestion + # qqqq config is only at pipeline level use yml anchor points if need to reuse + configuration: + ################ Map Databricks Bundle variables to Spark Config Properties ################ + # Map the Bundle variables (from databricks.yml) to Spark config properties + # The key names here MUST match what you use in spark.conf.get() in Python! + # bundle.env_name: ${var.env_name} + # bundle.schema_prefix: ${var.schema_prefix} - qqqq setting schema now in the yml + # bundle.storage_account: ${var.storage_account} + ############### Resource yml files for set of pipelines ################# + # If we do bronze, silver ... tranformation based layers with own yml files will define layer level vars here + # for example this would be + # bundle.layer_name: bronze -> #schema_layer = "bronze_" -> # schema_layer = park.conf.get("bundle.layer_name") + # configuration: + <<: *bronze-config #config anchor point for bronze layer so all pipelines in this file will have this set of configs pipeline.domain: ods # if we then want to apply per pipeline variable here \ No newline at end of file From 70820d5c8dd6200c761cef1213cc7c180fe64243 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Tue, 9 Dec 2025 11:09:37 +0000 Subject: [PATCH 09/14] refeactor ods before moving to wheel --- src/ingestion/ods_ingest.py | 379 +++++++++++++++++++++++------------- 1 file changed, 245 insertions(+), 134 deletions(-) diff --git a/src/ingestion/ods_ingest.py b/src/ingestion/ods_ingest.py index 9812f39..a84419e 100644 --- a/src/ingestion/ods_ingest.py +++ b/src/ingestion/ods_ingest.py @@ -35,152 +35,263 @@ # domain_folder ?? 
qqqq folder_name = spark.conf.get("pipeline.domain") # ods #folder_location_path = f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/{folder_name}/" -folder_location_path = f"{storage_container_path }/{folder_name}/" +folder_location_path = f"{storage_container_path}/{folder_name}/" # "abfss://bronze@unifiedrptdeltalake.dfs.core.windows.net/ods print(folder_location_path) +# qqqq this could be far simpler hardcode "import raw" the just list the names but we want it a bit more flexible +# List of table names +TABLE_NAMES = [ + "Additional_Attributes_Details", + "Code_System_Details", + "Contact_Details", + "Manifest_Details", + "Organisation_Details", + "OtherID_Details", + "PrimaryRole_Details", + "Relationship_Details", + "Role_Details", + "Successor_Details", +] -@dp.table( - # qqqq was f"{schema_name}.Additional_Attributes_Details" but worked before now need to do it this way???!!! - name="Additional_Attributes_Details", - comment="Import raw Additional_Attributes_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}Additional_Attributes_Details.csv" - ) - ) -@dp.table( - name="Code_System_Details", - comment="Import raw Code_System_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}Code_System_Details.csv" - ) - ) - -@dp.table( - name="Contact_Details", - comment="Import raw Contact_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}Contact_Details.csv" - ) - ) - -@dp.table( - name="Manifest_Details", - comment="Import raw Manifest_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}Manifest_Details.csv" - ) - ) - -@dp.table( - name="Organisation_Details", - comment="Import raw Organisation_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}Organisation_Details.csv" - ) - ) - -@dp.table( - name="OtherID_Details", - comment="Import raw OtherID_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}OtherID_Details.csv" - ) - ) - -@dp.table( - name="PrimaryRole_Details", - comment="Import raw PrimaryRole_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}PrimaryRole_Details.csv" - ) - ) - -@dp.table( - name="Relationship_Details", - comment="Import raw Relationship_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}Relationship_Details.csv" - ) - ) - -@dp.table( - name="Role_Details", - comment="Import raw Role_Details" -) -def azure_csv_table(): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load( - f"{folder_location_path}Role_Details.csv" - ) - ) +# Define all ODS tables with their configurations +# tables as keys because unique +# only using dictionary because expect future use to have more to 
it -@dp.table( - name="Successor_Details", - comment="Import raw Successor_Details" -) -def azure_csv_table(): +# Generate table configurations dynamically +ODS_TABLES = { + table_name: { + "csv_filename": f"{table_name}.csv", + "comment": f"Import raw {table_name}" + } + for table_name in TABLE_NAMES +} + +# ODS_TABLES = { +# "Additional_Attributes_Details": { +# "csv_filename": "Additional_Attributes_Details.csv", +# "comment": "Import raw Additional_Attributes_Details" +# }, +# "Code_System_Details": { +# "csv_filename": "Code_System_Details.csv", +# "comment": "Import raw Code_System_Details" +# }, +# "Contact_Details": { +# "csv_filename": "Contact_Details.csv", +# "comment": "Import raw Contact_Details" +# }, +# "Manifest_Details": { +# "csv_filename": "Manifest_Details.csv", +# "comment": "Import raw Manifest_Details" +# }, +# "Organisation_Details": { +# "csv_filename": "Organisation_Details.csv", +# "comment": "Import raw Organisation_Details" +# }, +# "OtherID_Details": { +# "csv_filename": "OtherID_Details.csv", +# "comment": "Import raw OtherID_Details" +# }, +# "PrimaryRole_Details": { +# "csv_filename": "PrimaryRole_Details.csv", +# "comment": "Import raw PrimaryRole_Details" +# }, +# "Relationship_Details": { +# "csv_filename": "Relationship_Details.csv", +# "comment": "Import raw Relationship_Details" +# }, +# "Role_Details": { +# "csv_filename": "Role_Details.csv", +# "comment": "Import raw Role_Details" +# }, +# "Successor_Details": { +# "csv_filename": "Successor_Details.csv", +# "comment": "Import raw Successor_Details" +# }, +# } +def load_csv_table(base_path, csv_filename): + """Load CSV from Azure storage with standard options""" return ( spark.read.format("csv") .option("header", "true") .option("inferSchema", "true") - .load( - f"{folder_location_path}Successor_Details.csv" - ) + .load(f"{base_path}{csv_filename}") ) +# Create DLT tables dynamically +for table_name, config in ODS_TABLES.items(): + # Create a closure to capture the current loop variables + def create_table(name=table_name, cfg=config): + @dp.table(name=name, comment=cfg["comment"]) + def table_loader(): + return load_csv_table(folder_location_path, cfg["csv_filename"]) + return table_loader + + create_table() + + +# def load_csv_table(table_name): +# """Load CSV from Azure storage with standard options""" +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load(f"{folder_location_path}{table_name}.csv") +# ) + +# # Create DLT tables dynamically +# for table_name, comment in ODS_TABLES: +# # Create a closure to capture the current loop variables +# def create_table(name=table_name, desc=comment): +# @dp.table(name=name, comment=desc) +# def table_loader(): +# return load_csv_table(name) +# return table_loader + +# create_table() + +# @dp.table( +# # qqqq was f"{schema_name}.Additional_Attributes_Details" but worked before now need to do it this way???!!! 
+# name="Additional_Attributes_Details", +# comment="Import raw Additional_Attributes_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Additional_Attributes_Details.csv" +# ) +# ) + +# @dp.table( +# name="Code_System_Details", +# comment="Import raw Code_System_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Code_System_Details.csv" +# ) +# ) + +# @dp.table( +# name="Contact_Details", +# comment="Import raw Contact_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Contact_Details.csv" +# ) +# ) + +# @dp.table( +# name="Manifest_Details", +# comment="Import raw Manifest_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Manifest_Details.csv" +# ) +# ) + +# @dp.table( +# name="Organisation_Details", +# comment="Import raw Organisation_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Organisation_Details.csv" +# ) +# ) + +# @dp.table( +# name="OtherID_Details", +# comment="Import raw OtherID_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}OtherID_Details.csv" +# ) +# ) + +# @dp.table( +# name="PrimaryRole_Details", +# comment="Import raw PrimaryRole_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}PrimaryRole_Details.csv" +# ) +# ) + +# @dp.table( +# name="Relationship_Details", +# comment="Import raw Relationship_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Relationship_Details.csv" +# ) +# ) + +# @dp.table( +# name="Role_Details", +# comment="Import raw Role_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Role_Details.csv" +# ) +# ) + +# @dp.table( +# name="Successor_Details", +# comment="Import raw Successor_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"{folder_location_path}Successor_Details.csv" +# ) +# ) + ###### Try this # import dlt From d1c8d8cc55071e6378b952f4c5c7c1e49b80c863 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Tue, 9 Dec 2025 14:18:08 +0000 Subject: [PATCH 10/14] commit --- databricks.yml | 8 ++--- pyproject.toml | 65 +++++++++++++++++++------------------ src/ingestion/ods_ingest.py | 21 +++++++----- src/utils/.gitinclude | 0 src/utils/__init__.py | 3 ++ src/utils/loaders.py | 21 ++++++++++++ 6 files changed, 73 insertions(+), 45 deletions(-) delete mode 100644 src/utils/.gitinclude create mode 100644 src/utils/__init__.py create mode 100644 src/utils/loaders.py diff --git a/databricks.yml 
b/databricks.yml index 03ccce6..de31443 100644 --- a/databricks.yml +++ b/databricks.yml @@ -84,11 +84,11 @@ variables: prod_env_sp_id: default: "my-sp-id-jfsdkjhfjsdhfkjh" -# qqqq will want later if many python files +# for external package maybe qqqq i dont think we will export there are poetry bundle example repos if we decide to # artifacts: - # python_artifact: - # type: whl - # build: uv build --wheel +# python_artifact: +# type: whl +# build: uv build --wheel # Deployment environments targets: diff --git a/pyproject.toml b/pyproject.toml index 1707b8a..940124b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,32 +1,33 @@ -[project] -name = "Workflow_POC" -version = "0.0.1" -authors = [{ name = "philip.tate@nhs.net" }] -requires-python = ">=3.10,<=3.13" -dependencies = [ - # Any dependencies for jobs and pipelines in this project can be added here - # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies - # - # LIMITATION: for pipelines, dependencies are cached during development; - # add dependencies to the 'environment' section of your pipeline.yml file instead -] - -[dependency-groups] -dev = [ - "pytest", - "databricks-dlt", - "databricks-connect>=15.4,<15.5", -] - -[project.scripts] -main = "Workflow_POC.main:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src"] - -[tool.black] -line-length = 125 +[project] +name = "Workflow_POC" +version = "0.0.1" +authors = [{ name = "philip.tate@nhs.net" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of your pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-dlt", + "databricks-connect>=15.4,<15.5", +] + +[project.scripts] +main = "Workflow_POC.main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +# Wheel packing came in the template :) qqqq +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.black] +line-length = 125 diff --git a/src/ingestion/ods_ingest.py b/src/ingestion/ods_ingest.py index a84419e..004242b 100644 --- a/src/ingestion/ods_ingest.py +++ b/src/ingestion/ods_ingest.py @@ -2,7 +2,8 @@ # qqqq problem i am having is that we are setting the schema, and dev has schema set as user names # i want to use the databricks.yml schema name for dev, and for staging and prod i want to set it to bronze_ods in this script from pyspark import pipelines as dp - +#from utils.loaders import load_csv_table #use wheel instead +from utils.loaders import load_csv_table # Fixed System Constants these and some of this stuff should be going in a helper i think #ADLS_PROTOCOL = "abfss://" @@ -110,14 +111,16 @@ # "comment": "Import raw Successor_Details" # }, # } -def load_csv_table(base_path, csv_filename): - """Load CSV from Azure storage with standard options""" - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load(f"{base_path}{csv_filename}") - ) + +## get from wheel +# def load_csv_table(base_path, csv_filename): +# """Load CSV from Azure storage with standard options""" +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load(f"{base_path}{csv_filename}") 
+# ) # Create DLT tables dynamically for table_name, config in ODS_TABLES.items(): diff --git a/src/utils/.gitinclude b/src/utils/.gitinclude deleted file mode 100644 index e69de29..0000000 diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..df638bb --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,3 @@ +from utils.loaders import load_csv_table + +__all__ = ["load_csv_table"] \ No newline at end of file diff --git a/src/utils/loaders.py b/src/utils/loaders.py new file mode 100644 index 0000000..6fc0719 --- /dev/null +++ b/src/utils/loaders.py @@ -0,0 +1,21 @@ +"""Data loading utilities for Databricks pipelines""" +# i dont think we will want these as a package just as a module we wont be expoting and its just and extra steps for analyst which currently i do not think will provide value until they request it and will get in their way +def load_csv_table(base_path, csv_filename): + """Load CSV from Azure storage with standard options + + Args: + base_path: Base path to the folder containing CSV files + csv_filename: Name of the CSV file to load + + Returns: + DataFrame: Spark DataFrame with CSV data + """ + from pyspark.sql import SparkSession + spark = SparkSession.builder.getOrCreate() + + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load(f"{base_path}{csv_filename}") + ) \ No newline at end of file From 7a2cf3636d01d3d63416afcd437b5c7e838a1224 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Tue, 9 Dec 2025 16:15:49 +0000 Subject: [PATCH 11/14] try it --- resources/pipeline/ods_ingestion.yml | 4 +++- src/ingestion/ods_ingest.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/resources/pipeline/ods_ingestion.yml b/resources/pipeline/ods_ingestion.yml index 7c91c29..8a6a1ea 100644 --- a/resources/pipeline/ods_ingestion.yml +++ b/resources/pipeline/ods_ingestion.yml @@ -60,7 +60,9 @@ resources: # if doing a pipeline per layer would do something like # include: ../../src/ingestion/ - might work # include: ../../src/ingestion/*.py - doesnt work - include: ../../src/ingestion/ods_ingest.py + include: ../../src/ingestion/ods_ingest.py # -worked + # include: ../../src/**/*.py + #- folder: ../../src/utils photon: true # qqqq good practice to specify its something to do with dlt having beta version? channel: current diff --git a/src/ingestion/ods_ingest.py b/src/ingestion/ods_ingest.py index 004242b..4abe78e 100644 --- a/src/ingestion/ods_ingest.py +++ b/src/ingestion/ods_ingest.py @@ -1,6 +1,18 @@ # for making spark tables # qqqq problem i am having is that we are setting the schema, and dev has schema set as user names # i want to use the databricks.yml schema name for dev, and for staging and prod i want to set it to bronze_ods in this script + +# This does work, but dont know how the file is working generally when bundle files are not generated anymore, so its not working from within the bundle. 
+# ultimately probably still want to be in a wheel will see what happens with unit testing it +import sys +import os +sys.path.append(os.path.abspath('..')) + +### TRY ### +# import sys +# bundle_src_path = sys.argv[1] +# sys.path.append(bundle_src_path) + from pyspark import pipelines as dp #from utils.loaders import load_csv_table #use wheel instead from utils.loaders import load_csv_table From d15db3d18381894f5dfdaade7e2d127aa4f9f44d Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Tue, 23 Dec 2025 10:16:42 +0000 Subject: [PATCH 12/14] A working test in notebook folder --- .gitignore | 92 ++--- conftest.py-comebackto.txt | 99 ++++++ conftest.py-disablefornow | 99 ++++++ notebooks/__init__.py-nameclashdel | 0 notebooks/unit-tests/ReadMe.md | 2 + .../unit-tests/__init__.py-notimportingfrom | 0 ...ntitled Notebook 2025-12-16 16_15_04.ipynb | 99 ++++++ .../utils/__init__.pynotimportingfrom | 0 .../notebook_integration_ingestion_test.ipynb | 283 ++++++++++++++++ notebooks/unit-tests/utils/run_tests.py | 14 + notebooks/unit-tests/utils/test_loader_5.py | 268 +++++++++++++++ .../utils/test_loader_5.py-dbutils-fail | 115 +++++++ pyproject.toml | 15 +- pytest.ini.-use toml instead.txt | 30 ++ src/__init__.py | 0 src/ingestion/ods_ingest.py | 5 +- src/utils/__init__.py | 4 +- src/utils/loaders.py | 4 +- tests/__init__.pynotapackagepath | 0 tests/data-quality-tests/.gitinclude | 0 tests/unit-tests/ReadMe.md | 19 ++ tests/unit-tests/__init__.pynope | 0 .../bronze/(Clone) (Clone) .gitinclude | 0 .../gold/(Clone) (Clone) .gitinclude | 0 .../unit-tests/ingestion/(Clone) .gitinclude | 0 .../(Clone) (Clone) (Clone) .gitinclude | 0 .../(Clone) (Clone) .gitinclude | 0 tests/unit-tests/utils/fixtures/.gitkeep | 0 tests/unit-tests/utils/fixtures/org_data.csv | 4 + .../utils/test_loaders2 fail.py-fail | 99 ++++++ tests/unit-tests/utils/test_loaders_4.py | 319 ++++++++++++++++++ 31 files changed, 1517 insertions(+), 53 deletions(-) create mode 100644 conftest.py-comebackto.txt create mode 100644 conftest.py-disablefornow create mode 100644 notebooks/__init__.py-nameclashdel create mode 100644 notebooks/unit-tests/ReadMe.md create mode 100644 notebooks/unit-tests/__init__.py-notimportingfrom create mode 100644 notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb create mode 100644 notebooks/unit-tests/utils/__init__.pynotimportingfrom create mode 100644 notebooks/unit-tests/utils/notebook_integration_ingestion_test.ipynb create mode 100644 notebooks/unit-tests/utils/run_tests.py create mode 100644 notebooks/unit-tests/utils/test_loader_5.py create mode 100644 notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail create mode 100644 pytest.ini.-use toml instead.txt create mode 100644 src/__init__.py create mode 100644 tests/__init__.pynotapackagepath create mode 100644 tests/data-quality-tests/.gitinclude create mode 100644 tests/unit-tests/ReadMe.md create mode 100644 tests/unit-tests/__init__.pynope create mode 100644 tests/unit-tests/bronze/(Clone) (Clone) .gitinclude create mode 100644 tests/unit-tests/gold/(Clone) (Clone) .gitinclude create mode 100644 tests/unit-tests/ingestion/(Clone) .gitinclude create mode 100644 tests/unit-tests/silver/(Clone) (Clone) (Clone) .gitinclude create mode 100644 tests/unit-tests/transformations/(Clone) (Clone) .gitinclude create mode 100644 tests/unit-tests/utils/fixtures/.gitkeep create mode 100644 tests/unit-tests/utils/fixtures/org_data.csv create mode 100644 tests/unit-tests/utils/test_loaders2 fail.py-fail create mode 100644 
tests/unit-tests/utils/test_loaders_4.py diff --git a/.gitignore b/.gitignore index 840b1da..37b748f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,47 +1,47 @@ -# Couldnt find an official gitignore this is AI generated -# ----------------------------- -# Databricks / DAB / dbx -# ----------------------------- -.databricks/ # local workspace metadata / CLI files -.deploy/ # local deploy cache (dbx/DAB) -.bundle/ # local bundle files (dbx/DAB) -*.log # temporary logs -*.tmp # temporary files -dbx_project.yaml.bak # backup of bundle config -build/ -dist/ - -# ----------------------------- -# Python -# ----------------------------- -__pycache__/ -*.pyc -*.pyo -*.pyd -*.egg-info/ -.venv/ -env/ -pip-selfcheck.json - -# ----------------------------- -# Jupyter Notebooks -# ----------------------------- -.ipynb_checkpoints/ - -# ----------------------------- -# Scratch / experimental folder -# ----------------------------- -scratch/** # ignore all files in scratch -!scratch/README.md # except placeholder README.md - -# ----------------------------- -# IDE / editor -# ----------------------------- -.vscode/ -.idea/ - -# ----------------------------- -# OS / system -# ----------------------------- -.DS_Store +# Couldnt find an official gitignore this is AI generated +# ----------------------------- +# Databricks / DAB / dbx +# ----------------------------- +.databricks/ # local workspace metadata / CLI files +.deploy/ # local deploy cache (dbx/DAB) +.bundle/ # local bundle files (dbx/DAB) +*.log # temporary logs +*.tmp # temporary files +dbx_project.yaml.bak # backup of bundle config +build/ +dist/ + +# ----------------------------- +# Python +# ----------------------------- +__pycache__/ +*.pyc +*.pyo +*.pyd +*.egg-info/ +.venv/ +env/ +pip-selfcheck.json + +# ----------------------------- +# Jupyter Notebooks +# ----------------------------- +.ipynb_checkpoints/ + +# ----------------------------- +# Scratch / experimental folder +# ----------------------------- +scratch/** # ignore all files in scratch +!scratch/README.md # except placeholder README.md + +# ----------------------------- +# IDE / editor +# ----------------------------- +.vscode/ +.idea/ + +# ----------------------------- +# OS / system +# ----------------------------- +.DS_Store Thumbs.db \ No newline at end of file diff --git a/conftest.py-comebackto.txt b/conftest.py-comebackto.txt new file mode 100644 index 0000000..fb99be0 --- /dev/null +++ b/conftest.py-comebackto.txt @@ -0,0 +1,99 @@ +# copy paste from [Dab repo examples](https://github.com/databricks/bundle-examples/blob/1cf3dba30a897d68e3e74ab17f0a3dff68392f15/default_python/tests/conftest.py) +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv + import os +except ImportError: + raise ImportError( + "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." + ) + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. 
+ + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. + + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. + if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() \ No newline at end of file diff --git a/conftest.py-disablefornow b/conftest.py-disablefornow new file mode 100644 index 0000000..fb99be0 --- /dev/null +++ b/conftest.py-disablefornow @@ -0,0 +1,99 @@ +# copy paste from [Dab repo examples](https://github.com/databricks/bundle-examples/blob/1cf3dba30a897d68e3e74ab17f0a3dff68392f15/default_python/tests/conftest.py) +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv + import os +except ImportError: + raise ImportError( + "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." + ) + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. 
+ + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. + + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. + if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() \ No newline at end of file diff --git a/notebooks/__init__.py-nameclashdel b/notebooks/__init__.py-nameclashdel new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/unit-tests/ReadMe.md b/notebooks/unit-tests/ReadMe.md new file mode 100644 index 0000000..3541e50 --- /dev/null +++ b/notebooks/unit-tests/ReadMe.md @@ -0,0 +1,2 @@ +# Thoughts +I am unsure if we would have layer based folders to mirror the unit tests here. it may depend on if we would just run the lot. Or if in fact we would use notebooks but would use some kind of built in test runner. 
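+
+As a sketch of the built-in-runner option (utils/run_tests.py in this folder takes the same approach), a single driver cell or script can hand a folder to pytest; the path below is illustrative only:
+
+```python
+import sys
+import pytest
+
+# Keep __pycache__ out of the workspace folders
+sys.dont_write_bytecode = True
+
+# Run whichever test folder we settle on, with verbose output and short tracebacks
+exit_code = pytest.main(["notebooks/unit-tests", "-v", "-s", "--tb=short"])
+```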
diff --git a/notebooks/unit-tests/__init__.py-notimportingfrom b/notebooks/unit-tests/__init__.py-notimportingfrom new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb b/notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb new file mode 100644 index 0000000..163d1a1 --- /dev/null +++ b/notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "317bf231-1e75-47ac-989b-49a2299cf416", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Trying out cells\n", + "\n", + "Not running pyspark successfully here\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7bdae43d-b7da-419f-acdc-16606c7a5823", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pytest\n", + "from pyspark.sql import SparkSession" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "cdc41037-b32e-418d-9a15-99cf81241569", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "**Extracting Data**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4e4d6d9a-2d09-4710-8774-c2bc4d9717e9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "Untitled Notebook 2025-12-16 16_15_04", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/unit-tests/utils/__init__.pynotimportingfrom b/notebooks/unit-tests/utils/__init__.pynotimportingfrom new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/unit-tests/utils/notebook_integration_ingestion_test.ipynb b/notebooks/unit-tests/utils/notebook_integration_ingestion_test.ipynb new file mode 100644 index 0000000..4fb4e24 --- /dev/null +++ b/notebooks/unit-tests/utils/notebook_integration_ingestion_test.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2211afe9-d3ee-48ab-b94c-e9cbd4fd7cad", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Notebook advantages\n", + "Has dbutils and spark\n", + "# Issues\n", + "Pathing is awful\n", + " - EOE do have a function to help\n", + " - DBX suggests wheels which is more technical\n", + "Hardcoding things dont want to so need to add them to some secrets\n", + "Spark may not run in pytest but is running here\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bdf58cb9-497d-41ea-a320-4d427cd51727", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import sys, os\n", + "# we need better routing\n", + "PROJECT_ROOT = \"/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src\"\n", + "sys.path.insert(0, PROJECT_ROOT)\n", + "\n", + "print(\"Project root:\", PROJECT_ROOT)\n", + "print(\"Contents:\", os.listdir(PROJECT_ROOT))\n", + "\n", + "from utils import load_csv_table\n", + "\n", + "\n", + "# Example configuration\n", + "layer = \"bronze\"\n", + "domain = \"ods\"\n", + "# had to do via command line locally need azure key vault access\n", + "storage_account_url = dbutils.secrets.get(scope=\"poc-secrets\", key=\"storage_account_url\")\n", + "base_path = f\"abfss://{layer}{storage_account_url}/{domain}/\"\n", + "csv_filename = \"Contact_Details.csv\"\n", + "\n", + "# Load the CSV\n", + "#df = spark.read.option('header', 'true').csv(os.path.join(base_path, csv_filename))\n", + "df = load_csv_table(spark, base_path, csv_filename)\n", + "# Preview\n", + "df.show(5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5d458372-8f7e-4730-b1cc-758c3e50a499", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Just better secret storage\n", + " - # Example configuration\n", + "layer = \"bronze\"\n", + "domain = \"ods\"\n", + "# had to do via command line locally need azure key vault access\n", + "storage_account_url = dbutils.secrets.get(scope=\"poc-secrets\", key=\"storage_account_url\")\n", + "base_path = f\"abfss://{layer}{storage_account_url}/{domain}/\"\n", + "csv_filename = \"Contact_Details.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a98cac63-2105-4549-918d-1392fae31c97", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# List all secret scopes available in your workspace\n", + "# scopes = dbutils.secrets.listScopes()\n", + "# for scope in scopes:\n", + "# print(scope.name)\n", + "\n", + "# keys = dbutils.secrets.list(\"UnifiedReportingDevKeyVault\")\n", + "# for key in keys:\n", + "# print(key.key)\n", + "\n", + "keys = dbutils.secrets.list(\"poc-secrets\")\n", + "for key in keys:\n", + " print(key.key)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bde3d03b-f58b-45e9-8b7d-b366023c2ce7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Test logic\n", + "First we need to just know can compare against a DF before we worry about test detection\n", + "\n", + "Then we need pytest" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3dcd99c3-da1a-42c1-b65d-a4ab4e442205", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "\"\"\"Simple test for load_csv_table - Databricks UI 
version\"\"\"\n", + "\n", + "from pyspark.sql import SparkSession\n", + "import sys\n", + "\n", + "# Routing\n", + "PROJECT_ROOT = \"/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src\"\n", + "sys.path.insert(0, PROJECT_ROOT)\n", + "\n", + "from utils import load_csv_table\n", + "\n", + "\n", + "def test_load_csv_table():\n", + " \"\"\"Test that load_csv_table loads CSV correctly\"\"\"\n", + " \n", + " print(\"🧪 Starting test...\")\n", + " \n", + " # Get Spark session\n", + " spark = SparkSession.getActiveSession()\n", + " \n", + " # Create test data\n", + " data = [\n", + " (\"ORG001\", \"Test Hospital\", \"Active\"),\n", + " (\"ORG002\", \"Test Clinic\", \"Inactive\"),\n", + " (\"ORG003\", \"Test Surgery\", \"Active\")\n", + " ]\n", + " columns = [\"OrganisationID\", \"Name\", \"Status\"]\n", + " \n", + " test_df = spark.createDataFrame(data, columns)\n", + " \n", + " # Use DBFS temp location (works better in Databricks UI)\n", + " import random\n", + " test_id = random.randint(10000, 99999)\n", + " temp_path = f\"/tmp/test_csv_{test_id}\"\n", + " \n", + " print(f\"📁 Writing test data to: {temp_path}\")\n", + " \n", + " try:\n", + " # Write test data\n", + " test_df.coalesce(1).write.csv(\n", + " temp_path,\n", + " header=True,\n", + " mode=\"overwrite\"\n", + " )\n", + " \n", + " # Find the actual CSV file (Spark creates part-*.csv)\n", + " files = dbutils.fs.ls(temp_path)\n", + " csv_file = [f for f in files if f.name.startswith(\"part-\") and f.name.endswith(\".csv\")][0]\n", + " \n", + " # Rename to expected filename\n", + " new_path = f\"{temp_path}/Contact_Details.csv\"\n", + " dbutils.fs.mv(csv_file.path, new_path)\n", + " \n", + " # TEST: Load the CSV\n", + " base_path = temp_path + \"/\"\n", + " csv_filename = \"Contact_Details.csv\"\n", + " \n", + " print(f\"📂 Loading from: {base_path}{csv_filename}\")\n", + " df = load_csv_table(spark, base_path, csv_filename)\n", + " \n", + " # Verify results\n", + " print(f\"✓ DataFrame created: {df is not None}\")\n", + " print(f\"✓ Row count: {df.count()} (expected 3)\")\n", + " print(f\"✓ Columns: {df.columns}\")\n", + " \n", + " rows = df.collect()\n", + " print(f\"✓ First row data: {rows[0]['OrganisationID']}, {rows[0]['Name']}, {rows[0]['Status']}\")\n", + " \n", + " # Show the data\n", + " print(\"\\n📊 Loaded data:\")\n", + " df.show()\n", + " \n", + " # Assertions\n", + " assert df is not None, \"❌ DataFrame is None\"\n", + " assert df.count() == 3, f\"❌ Expected 3 rows, got {df.count()}\"\n", + " assert \"OrganisationID\" in df.columns, \"❌ Missing OrganisationID column\"\n", + " assert \"Name\" in df.columns, \"❌ Missing Name column\"\n", + " assert \"Status\" in df.columns, \"❌ Missing Status column\"\n", + " assert rows[0][\"OrganisationID\"] == \"ORG001\", \"❌ Wrong data in first row\"\n", + " \n", + " print(\"\\n✅ ALL TESTS PASSED!\")\n", + " \n", + " return df\n", + " \n", + " finally:\n", + " # Clean up test data\n", + " try:\n", + " print(f\"\\n🧹 Cleaning up: {temp_path}\")\n", + " dbutils.fs.rm(temp_path, recurse=True)\n", + " print(\"✓ Cleanup complete\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Cleanup failed (not critical): {e}\")\n", + "\n", + "\n", + "# 🚀 RUN THE TEST NOW\n", + "test_load_csv_table()" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + 
"pythonIndentUnit": 4 + }, + "notebookName": "notebook_integration_ingestion_test", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/unit-tests/utils/run_tests.py b/notebooks/unit-tests/utils/run_tests.py new file mode 100644 index 0000000..78f81f1 --- /dev/null +++ b/notebooks/unit-tests/utils/run_tests.py @@ -0,0 +1,14 @@ + +import sys +import pytest + +# Prevent __pycache__ creation +sys.dont_write_bytecode = True + +# Run your test file +pytest.main([ + "test_loader_5.py", # your test file + "-v", # verbose + "-s", # show print output + "--tb=short" # short tracebacks +]) diff --git a/notebooks/unit-tests/utils/test_loader_5.py b/notebooks/unit-tests/utils/test_loader_5.py new file mode 100644 index 0000000..221f8d4 --- /dev/null +++ b/notebooks/unit-tests/utils/test_loader_5.py @@ -0,0 +1,268 @@ +import sys, os +import shutil +import random +import pytest +from pathlib import Path +from pyspark.sql import SparkSession +import random +from pyspark.dbutils import DBUtils + + +############################### +## This is more integration now anyway i need a core function test for unit +## Spark and dbutils are not going to work in git +## Routing wont work in git actions +## Need to move things into testing config +## need to move thing into toml +## file needs to be in correct location +############################## + +# we need better routing +PROJECT_ROOT = "/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src" +sys.path.insert(0, PROJECT_ROOT) + +from utils import load_csv_table + + +###################################### +### PyTest doesnt have access to spark and dbutils in the same way so we are checking the test setup here ### +###################################### + +# Use existing Spark session in Databricks (Spark Connect) +@pytest.fixture(scope="session") +def spark(): + session = SparkSession.getActiveSession() + if session is None: + raise RuntimeError("No active Spark session found. 
Ensure you are running in Databricks.") + return session + +def test_load_csv_table_function_using_dbfs(spark): + """ + Test loading CSV from DBFS - THIS SHOULD WORK + """ + dbutils = DBUtils(spark) + + # Create test data + test_data = [ + ("ORG001", "Test Hospital", "Active"), + ("ORG002", "Test Clinic", "Inactive"), + ("ORG003", "Test Surgery", "Active") + ] + columns = ["OrganisationID", "Name", "Status"] + test_df = spark.createDataFrame(test_data, columns) + + # Write to DBFS + test_id = random.randint(10000, 99999) + dbfs_path = f"/tmp/test_dbfs_{test_id}" + filename = "Contact_Details.csv" + + # Write using Spark (creates directory with part files) + test_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(dbfs_path) + + print(f"✓ Wrote to DBFS: {dbfs_path}") + + # Setup paths + test_id = random.randint(10000, 99999) + dbfs_folder = f"/tmp/test_dbfs_{test_id}" + filename = "Contact_Details.csv" + + # STEP 1: Write to temporary location (Spark creates part files) + temp_write_path = f"{dbfs_folder}/temp_write" + test_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(temp_write_path) + + print(f"✓ Wrote to temp location: {temp_write_path}") + + # STEP 2: Find the actual CSV part file that was created + files = dbutils.fs.ls(temp_write_path) + csv_part_file = [f.path for f in files if f.path.endswith('.csv')][0] + + print(f"✓ Found part file: {csv_part_file}") + + # STEP 3: Copy it to a proper filename + final_csv_path = f"{dbfs_folder}/{filename}" + dbutils.fs.cp(csv_part_file, final_csv_path) + + print(f"✓ Copied to proper filename: {final_csv_path}") + + # Test our function with DBFS path + base_path = f"dbfs:{dbfs_folder}/" + df = load_csv_table(spark, base_path, filename) + # worked but its using partials files in a folder directory - i want to test with an actual full file + # df = load_csv_table_copy(spark, base_path, "") # Empty filename since Spark reads the whole directory + + print(f"✓ Loaded using: load_csv_table(spark, '{base_path}', '{filename}')") + df.show() + + # Assertions + assert df.count() == 3, f"Expected 3 rows, got {df.count()}" + assert "OrganisationID" in df.columns + assert "Name" in df.columns + assert "Status" in df.columns + + first_row = df.collect()[0] + assert first_row["OrganisationID"] == "ORG001" + + # Cleanup + dbutils.fs.rm(dbfs_path, recurse=True) + print("✅ DBFS test PASSED!") + +####################################################### +###################################################### +## Junk because poc but very small chance useful for what tried if issues soon +######################################################## +#### not accessible by spark +# Fixture to create a temporary CSV for testing +# @pytest.fixture +# def mock_csv_data(spark, tmp_path): +# # Sample data +# data = [ +# ("ORG001", "Test Hospital", "Active"), +# ("ORG002", "Test Clinic", "Inactive"), +# ("ORG003", "Test Surgery", "Active") +# ] +# columns = ["OrganisationID", "Name", "Status"] +# test_df = spark.createDataFrame(data, columns) + +# # Unique temp folder +# test_id = random.randint(10000, 99999) +# temp_path = tmp_path / f"test_csv_{test_id}" +# temp_path.mkdir(parents=True, exist_ok=True) + +# # Databricks-safe CSV write: convert to Pandas and write +# csv_path = temp_path / "Contact_Details.csv" +# test_df.toPandas().to_csv(csv_path, index=False) + +# yield str(temp_path), "Contact_Details.csv" + +# # Cleanup +# shutil.rmtree(temp_path, ignore_errors=True) + +# # Example test using the fixture +# def 
test_csv_exists(mock_csv_data): +# folder, filename = mock_csv_data +# full_path = os.path.join(folder, filename) +# print(f"Checking CSV file exists at: {full_path}") +# assert os.path.exists(full_path) + +# Example using dbutils worked +# def test_dbutils_write_and_read_simple(spark): +# """ +# Simple test: Can we write a CSV to DBFS and read it back with Spark? +# This proves the core concept works. +# """ +# from pyspark.dbutils import DBUtils +# dbutils = DBUtils(spark) + +# # 0. CLEAN UP ANY STALE DATA FIRST +# test_path = "/tmp/simple_test_12345" +# try: +# dbutils.fs.rm(test_path, recurse=True) +# print(f"✓ Cleaned up stale data at: {test_path}") +# except: +# print(f"✓ No stale data to clean") + +# # 1. Create test data +# test_data = [("A", 1), ("B", 2), ("C", 3)] +# test_df = spark.createDataFrame(test_data, ["letter", "number"]) + +# # 2. Write to DBFS using Spark +# test_path = "/tmp/simple_test_12345" +# test_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(test_path) + +# print(f"✓ Wrote CSV to: {test_path}") + +# # 3. Check it exists using dbutils +# files = dbutils.fs.ls(test_path) +# print(f"✓ Files in DBFS: {[f.name for f in files]}") + +# # 4. Read it back using Spark +# df_read = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(f"dbfs:{test_path}") + +# print(f"✓ Read back {df_read.count()} rows") +# df_read.show() + +# # 5. Verify data +# assert df_read.count() == 3 +# assert "letter" in df_read.columns +# assert "number" in df_read.columns + +# # 6. Cleanup +# dbutils.fs.rm(test_path, recurse=True) +# print(f"✓ Cleaned up: {test_path}") + +# print("✅ SUCCESS: dbutils works in .py file, can write and read from DBFS!") + +#################################################################### +#### Using our test CSV test the loader function +######################################################## +################################################# +## Hardcode in function for now instead of reference +################################################### +## This is what we want to use ultimately +# df = load_csv_table(spark, folder + "/", filename) +# def load_csv_table_copy(spark, base_path, csv_filename): +# """Load CSV from Azure storage with standard options + +# Args: +# base_path: Base path to the folder containing CSV files +# csv_filename: Name of the CSV file to load + +# Returns: +# DataFrame: Spark DataFrame with CSV data +# """ + +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load(f"{base_path}{csv_filename}") +# ) + +# # Actual test using the fixture +# def test_load_csv_table_function(spark, mock_csv_data): +# folder, filename = mock_csv_data + +# print("Folder path:", folder) +# print("Filename:", filename) +# full_path = folder + "/" + filename +# print("Full CSV path passed to loader:", full_path) +# print("Does file exist?", os.path.exists(full_path)) + +# # Load using your function + + +# df = load_csv_table_copy(spark, folder + "/", filename) + + +# # Print DataFrame schema and first few rows +# print("DataFrame schema:") +# df.printSchema() +# print("First 5 rows:") +# df.show(5) + +# # Assertions +# assert df is not None, "DataFrame should not be None" +# count = df.count() +# print("Row count:", count) +# assert count == 3, f"Expected 3 rows, got {count}" +# assert "OrganisationID" in df.columns +# assert "Name" in df.columns +# assert "Status" in df.columns + +# # Check first row +# first_row = df.collect()[0] +# print("First row:", 
first_row.asDict()) +# assert first_row["OrganisationID"] == "ORG001" +# assert first_row["Name"] == "Test Hospital" +# assert first_row["Status"] == "Active" + + +# %pip install -e "/Workspace/Repos/philip.tate@nhs.net/PT Separate Feature Branch" +# %pip install pytest>=7.0 +# !python -m pytest # tests/unit-tests/ -v -p no:cacheprovider +# import os +# os.environ['PYTHONDONTWRITEBYTECODE'] = '1' +# import pytest +# tests_path = "/Workspace/Repos/philip.tate@nhs.net/PT Separate Feature Branch/tests/unit-tests/" +# !python -m pytest tests/unit-tests/ -v -p no:cacheprovider +# pytest.main([tests_path, "-v", "-p", "no:cacheprovider"]) \ No newline at end of file diff --git a/notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail b/notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail new file mode 100644 index 0000000..a91ce5f --- /dev/null +++ b/notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail @@ -0,0 +1,115 @@ +"""Pytest unit tests for data loading utilities""" + +import pytest +from pyspark.sql import SparkSession +import sys +import random + +# Routing +PROJECT_ROOT = "/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src" +sys.path.insert(0, PROJECT_ROOT) + +from utils import load_csv_table + + +@pytest.fixture(scope="session") +def spark(): + """Get the active Spark session (Databricks)""" + return SparkSession.getActiveSession() + + +@pytest.fixture +def mock_csv_data(spark): + """ + Create test CSV data in DBFS temp location. + Automatically cleans up after the test. + """ + # Create test data + data = [ + ("ORG001", "Test Hospital", "Active"), + ("ORG002", "Test Clinic", "Inactive"), + ("ORG003", "Test Surgery", "Active") + ] + columns = ["OrganisationID", "Name", "Status"] + + test_df = spark.createDataFrame(data, columns) + + # Use DBFS temp location with unique ID + test_id = random.randint(10000, 99999) + temp_path = f"/tmp/test_csv_{test_id}" + + # Write test data + test_df.coalesce(1).write.csv( + temp_path, + header=True, + mode="overwrite" + ) + + # Find and rename the CSV file + # qqqq butils in pytest or .py will not work because not a notebook + files = dbutils.fs.ls(temp_path) + csv_file = [f for f in files if f.name.startswith("part-") and f.name.endswith(".csv")][0] + new_path = f"{temp_path}/Contact_Details.csv" + dbutils.fs.mv(csv_file.path, new_path) + + # Yield the paths to the test + yield temp_path + "/", "Contact_Details.csv" + + # Cleanup after test + try: + dbutils.fs.rm(temp_path, recurse=True) + except Exception as e: + print(f"Warning: Failed to clean up {temp_path}: {e}") + + +def test_load_csv_table_basic(spark, mock_csv_data): + """Test that load_csv_table loads CSV correctly""" + base_path, csv_filename = mock_csv_data + + # Load the CSV + df = load_csv_table(spark, base_path, csv_filename) + + # Verify DataFrame was created + assert df is not None, "DataFrame should not be None" + + # Verify row count + assert df.count() == 3, f"Expected 3 rows, got {df.count()}" + + # Verify columns exist + columns = df.columns + assert "OrganisationID" in columns, "Missing OrganisationID column" + assert "Name" in columns, "Missing Name column" + assert "Status" in columns, "Missing Status column" + + # Verify data content + rows = df.collect() + assert rows[0]["OrganisationID"] == "ORG001", "First row OrganisationID incorrect" + assert rows[0]["Name"] == "Test Hospital", "First row Name incorrect" + assert rows[0]["Status"] == "Active", "First row Status incorrect" + + +def test_load_csv_table_column_names(spark, mock_csv_data): + """Test 
that column names are preserved correctly""" + base_path, csv_filename = mock_csv_data + + df = load_csv_table(spark, base_path, csv_filename) + + expected_columns = ["OrganisationID", "Name", "Status"] + assert df.columns == expected_columns, f"Columns don't match. Got: {df.columns}" + + +def test_load_csv_table_all_rows_loaded(spark, mock_csv_data): + """Test that all rows are loaded without data loss""" + base_path, csv_filename = mock_csv_data + + df = load_csv_table(spark, base_path, csv_filename) + + # Check all expected organisations are present + org_ids = [row["OrganisationID"] for row in df.collect()] + assert "ORG001" in org_ids + assert "ORG002" in org_ids + assert "ORG003" in org_ids + + +# 🚀 RUN THE TESTS NOW +pytest.main([__file__, "-v", "-s", "--tb=short"]) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 940124b..2ee3f70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,8 @@ name = "Workflow_POC" version = "0.0.1" authors = [{ name = "philip.tate@nhs.net" }] +description = "POC" +readme = "README.md" requires-python = ">=3.10,<=3.13" dependencies = [ # Any dependencies for jobs and pipelines in this project can be added here @@ -13,7 +15,7 @@ dependencies = [ [dependency-groups] dev = [ - "pytest", + # "pytest", "databricks-dlt", "databricks-connect>=15.4,<15.5", ] @@ -31,3 +33,14 @@ packages = ["src"] [tool.black] line-length = 125 + +# will tis clash with conftest +#[tool.pytest.ini_options] +#minversion = "6.0" +#testpaths = ["tests/unit-tests"] +#pythonpath = ["src"] +#python_files = ["test_*.py"] +#addopts = [ +# "-v", +# "--tb=short" +#] diff --git a/pytest.ini.-use toml instead.txt b/pytest.ini.-use toml instead.txt new file mode 100644 index 0000000..78b40db --- /dev/null +++ b/pytest.ini.-use toml instead.txt @@ -0,0 +1,30 @@ +# qqqq todo ai generated ask someone to scan over it and give a read of the ins and outs +[pytest] +# Pytest configuration for Databricks unit tests + +# Test discovery patterns +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Test paths +testpaths = tests + +# Output options +addopts = + -v + --tb=short + --strict-markers + --disable-warnings + +# Markers for organizing tests +markers = + unit: Unit tests that don't require external resources + integration: Integration tests that may require external systems + slow: Tests that take a long time to run + +# Minimum Python version +minversion = 3.8 + +# Directory patterns to ignore +norecursedirs = .git .tox dist build *.egg .venv venv \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ingestion/ods_ingest.py b/src/ingestion/ods_ingest.py index 4abe78e..c695702 100644 --- a/src/ingestion/ods_ingest.py +++ b/src/ingestion/ods_ingest.py @@ -1,3 +1,4 @@ + # for making spark tables # qqqq problem i am having is that we are setting the schema, and dev has schema set as user names # i want to use the databricks.yml schema name for dev, and for staging and prod i want to set it to bronze_ods in this script @@ -6,6 +7,7 @@ # ultimately probably still want to be in a wheel will see what happens with unit testing it import sys import os +# ai says This is an acceptable workaround for running code directly in Databricks pipelines/jobs when you haven't built a Python wheel. 
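+# Rough shape of the wheel route, if we take it later (a sketch, not set up here):
+# pyproject.toml already packages "src", so the bundle could build that as a wheel artifact
+# and attach it to the pipeline as a library; imports would then resolve the way the pytest
+# tests already write them (from src.utils.loaders import load_csv_table) with no sys.path edits.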
sys.path.append(os.path.abspath('..')) ### TRY ### @@ -140,7 +142,8 @@ def create_table(name=table_name, cfg=config): @dp.table(name=name, comment=cfg["comment"]) def table_loader(): - return load_csv_table(folder_location_path, cfg["csv_filename"]) + # spark is defined by databricks environment so may not need to directly define it + return load_csv_table( spark, folder_location_path, cfg["csv_filename"]) return table_loader create_table() diff --git a/src/utils/__init__.py b/src/utils/__init__.py index df638bb..c065388 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1,3 +1,3 @@ -from utils.loaders import load_csv_table - +# qqqq worked but not for test -> from utils.loaders import load_csv_table +from .loaders import load_csv_table __all__ = ["load_csv_table"] \ No newline at end of file diff --git a/src/utils/loaders.py b/src/utils/loaders.py index 6fc0719..68a4f5b 100644 --- a/src/utils/loaders.py +++ b/src/utils/loaders.py @@ -1,6 +1,6 @@ """Data loading utilities for Databricks pipelines""" # i dont think we will want these as a package just as a module we wont be expoting and its just and extra steps for analyst which currently i do not think will provide value until they request it and will get in their way -def load_csv_table(base_path, csv_filename): +def load_csv_table(spark, base_path, csv_filename): """Load CSV from Azure storage with standard options Args: @@ -10,8 +10,6 @@ def load_csv_table(base_path, csv_filename): Returns: DataFrame: Spark DataFrame with CSV data """ - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() return ( spark.read.format("csv") diff --git a/tests/__init__.pynotapackagepath b/tests/__init__.pynotapackagepath new file mode 100644 index 0000000..e69de29 diff --git a/tests/data-quality-tests/.gitinclude b/tests/data-quality-tests/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/ReadMe.md b/tests/unit-tests/ReadMe.md new file mode 100644 index 0000000..bd9bb5c --- /dev/null +++ b/tests/unit-tests/ReadMe.md @@ -0,0 +1,19 @@ +# Unit Tests + +The unit tests will allow us to reuse code confidently. We can change how a utility function does what it does and know it has not broken other code dependent on it for example. 
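+
+A minimal example of the kind of test this means (the fixture and helper names match the ones used under tests/unit-tests/utils and are shown only as an illustration):
+
+```python
+def test_load_csv_table_basic(spark, sample_csv_data):
+    base_path, csv_filename = sample_csv_data
+    df = load_csv_table(spark, base_path, csv_filename)
+
+    assert df.count() == 3
+    assert "OrganisationID" in df.columns
+```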
+ +# Running unit tests + +## Running in Databricks + +## How github runs Unit Test + +## How to add to Unit Tests + +## When to add Unit Tests + +## Cursory Notes on Peer Reviewing Unit Tests or recommend their addition + +# Folder Structure + +The structure is mirroring the src/layer structure \ No newline at end of file diff --git a/tests/unit-tests/__init__.pynope b/tests/unit-tests/__init__.pynope new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/bronze/(Clone) (Clone) .gitinclude b/tests/unit-tests/bronze/(Clone) (Clone) .gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/gold/(Clone) (Clone) .gitinclude b/tests/unit-tests/gold/(Clone) (Clone) .gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/ingestion/(Clone) .gitinclude b/tests/unit-tests/ingestion/(Clone) .gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/silver/(Clone) (Clone) (Clone) .gitinclude b/tests/unit-tests/silver/(Clone) (Clone) (Clone) .gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/transformations/(Clone) (Clone) .gitinclude b/tests/unit-tests/transformations/(Clone) (Clone) .gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/utils/fixtures/.gitkeep b/tests/unit-tests/utils/fixtures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit-tests/utils/fixtures/org_data.csv b/tests/unit-tests/utils/fixtures/org_data.csv new file mode 100644 index 0000000..02795cd --- /dev/null +++ b/tests/unit-tests/utils/fixtures/org_data.csv @@ -0,0 +1,4 @@ +OrganisationID,Name,Status +ORG001,Test Hospital,Active +ORG002,Test Clinic,Inactive +ORG003,Test Surgery,Active \ No newline at end of file diff --git a/tests/unit-tests/utils/test_loaders2 fail.py-fail b/tests/unit-tests/utils/test_loaders2 fail.py-fail new file mode 100644 index 0000000..3dd40d9 --- /dev/null +++ b/tests/unit-tests/utils/test_loaders2 fail.py-fail @@ -0,0 +1,99 @@ +import pytest +from pyspark.sql import SparkSession +from pathlib import Path +import os +import sys + +# --- HELPER FUNCTION TO GET DBUTILS (Required fix) --- +def get_dbutils(spark: SparkSession): + """ + Safely retrieves the dbutils object, necessary when spark.dbutils is not available + due to Spark Connect or other environment settings. + """ + try: + # Tries the standard Databricks notebook method + from pyspark.sql import SQLContext # Need this import for the notebook method + return SQLContext(spark.sparkContext).createDataFrame([('1', '2')], ['col1', 'col2']).sqlContext.sparkSession._jvm.com.databricks.dbutils.DBUtils.getDBUtils(spark.sparkContext) + except Exception: + # Fallback for alternative environments + try: + from IPython import get_ipython + return get_ipython().user_ns["dbutils"] + except: + # Final fallback, though unlikely to work if the above failed + raise Exception("Cannot find DBUtils object.") + +# --- Your Unmodified Production Function (Included for context) --- +def load_csv_table(spark, base_path, csv_filename): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load(f"{base_path}{csv_filename}") + ) + +# --- THE WORKING FIXTURE (Using the safe DBUtils getter) --- + +@pytest.fixture +def sample_csv_data(spark: SparkSession, tmp_path: Path): + """ + STAGES the static CSV file to a temporary DBFS location + to bypass the WorkspaceLocalFileSystem security restriction. + """ + # 1. 
Define Paths + csv_filename = "org_data.csv" + fixture_dir = "tests/unit-tests/utils/fixtures" + + local_base_path = Path(os.getcwd()) / fixture_dir + local_file_path = local_base_path / csv_filename + + # Define the TEMPORARY path on DBFS + # We use tmp_path name to ensure uniqueness + dbfs_staging_path = f"dbfs:/tmp/pytest_data/{tmp_path.name}/" + + # 2. Write the CSV content locally (if file doesn't exist) + csv_content = """OrganisationID,Name,Status +ORG001,Test Hospital,Active +ORG002,Test Clinic,Inactive +ORG003,Test Surgery,Active""" + + local_base_path.mkdir(parents=True, exist_ok=True) + local_file_path.write_text(csv_content) + + # 3. CRITICAL STEP: COPY the file from Local Disk to DBFS + try: + # FIX: Get dbutils using the helper function + dbutils = get_dbutils(spark) + + # Source path MUST start with 'file:' prefix for dbutils.fs.cp + local_source = f"file://{str(local_file_path)}" + + # Destination path is the folder on DBFS + dbutils.fs.cp(local_source, dbfs_staging_path, recurse=True) + + except Exception as e: + # This will catch the get_dbutils failure or the cp failure + raise RuntimeError(f"Failed to stage file to DBFS. Error: {e}") + + # 4. Return the DBFS path (It's now a folder containing the CSV) + # The load function will look for base_path/csv_filename which will fail if cp was recursive. + # We need the base_path to be the DBFS folder path. + return dbfs_staging_path, csv_filename + +# --- The Test Function (Unchanged, assuming you fixed the name) --- +def test_load_csv_table_basic(spark: SparkSession, sample_csv_data): + """ + This test loads the data from the temporary DBFS staging location. + """ + base_path, csv_filename = sample_csv_data + df = load_csv_table(spark, base_path, csv_filename) + + # Assertions... + assert df.count() == 3 + + # Cleanup the staged data (Recommended) + try: + get_dbutils(spark).fs.rm(base_path, recurse=True) + except Exception as e: + # Don't fail the test on cleanup failure + print(f"Warning: Failed to clean up DBFS path {base_path}. Error: {e}") \ No newline at end of file diff --git a/tests/unit-tests/utils/test_loaders_4.py b/tests/unit-tests/utils/test_loaders_4.py new file mode 100644 index 0000000..a081739 --- /dev/null +++ b/tests/unit-tests/utils/test_loaders_4.py @@ -0,0 +1,319 @@ +"""Unit tests for data loading utilities""" + +# []() + +# qqqq TODO this is very AI generated run it past someone make it more relevant for data team +# need to review some examples and refactor todo qqqq +import pytest +import tempfile +import os +from pyspark.sql import SparkSession +from pyspark.conf import SparkConf +#from pyspark.sql.types import StringType, IntegerType +from src.utils.loaders import load_csv_table +from pathlib import Path +# Helper function for dbutils access (used for cleanup) +# def get_dbutils(spark: SparkSession): +# """Safely get dbutils for cleaning up files on DBFS.""" +# try: +# # Databricks Connect (the likely scenario for your tests) +# return spark.sparkContext._jvm.com.databricks.dbutils_v1.DBUtilsHolder.dbutils +# except Exception: +# # Fallback (for completeness, though Connect should be the priority) +# return None + +# @pytest.fixture +# def sample_csv_data(spark: SparkSession, tmp_path: Path): +# """ +# 1. Creates CSV data locally (using pytest's tmp_path). +# 2. Stages the data onto DBFS using the 'file://' protocol. +# 3. Cleans up the DBFS files afterward using dbutils. +# """ +# csv_filename = "test_data.csv" +# csv_file_local = tmp_path / csv_filename + +# # 1. 
Create CSV locally +# csv_content = """OrganisationID,Name,Status +# ORG001,Test Hospital,Active +# ORG002,Test Clinic,Inactive +# ORG003,Test Surgery,Active""" +# csv_file_local.write_text(csv_content) + +# # 2. Define the unique DBFS staging directory +# dbfs_dir_path = f"dbfs:/tmp/{tmp_path.name}/" + +# # --- Data Staging (The fix is here) --- +# print(f"\n[INFO] Staging data from LOCAL: file://{str(csv_file_local)} to REMOTE: {dbfs_dir_path}") + +# try: +# spark.read.format("csv") \ +# .option("header", "true") \ +# .option("inferSchema", "true") \ +# .load(f"file://{str(csv_file_local)}") \ +# .write.format("csv") \ +# .option("header", "true") \ +# .mode("overwrite") \ +# .save(dbfs_dir_path) +# except Exception as e: +# print(f"[ERROR] Spark write failed during staging: {e}") +# raise # Re-raise the exception to fail the test setup + +# # 3. Return the path to the test function (without the 'dbfs:' prefix) +# dbfs_base_path_for_load = f"/tmp/{tmp_path.name}/" + +# # --- YIELD (Test Execution) and CLEANUP (Teardown) --- +# try: +# yield dbfs_base_path_for_load, csv_filename +# finally: +# # 4. Clean up the files on DBFS +# print(f"\n[INFO] Starting DBFS cleanup for: {dbfs_dir_path}") +# try: +# dbutils = get_dbutils(spark) +# if dbutils: +# dbutils.fs.rm(dbfs_dir_path, True) +# print(f"[INFO] DBFS cleanup complete.") +# else: +# print("[WARN] Could not retrieve dbutils. Skipping DBFS cleanup.") +# except Exception as e: +# print(f"[WARN] Failed to clean up DBFS path {dbfs_dir_path}. Manual cleanup may be required. Error: {e}") + +# finds it but no permission +# @pytest.fixture +# def sample_csv_data(): +# """ +# Returns the paths needed to read the static CSV file located in the repo +# fixtures folder, ensuring the 'file://' prefix is used. + +# NOTE: Assumes you have created the file at: +# 'tests/unit-tests/utils/fixtures/org_data.csv' +# """ +# # --- Configuration (Adjust relative path if needed) --- +# csv_filename = "org_data.csv" +# fixture_dir = "tests/unit-tests/utils/fixtures" + +# # 1. Calculate the absolute path on the driver's local disk +# # This finds the file's true physical location +# absolute_base_path = Path(os.getcwd()) / fixture_dir + +# # 2. CRITICAL FIX: Add 'file://' prefix to override the DBFS assumption +# # Path must end with a slash for your function's concatenation to work. +# base_path_with_protocol = f"file://{str(absolute_base_path)}/" + +# return base_path_with_protocol, csv_filename +@pytest.fixture +def sample_csv_data(spark: SparkSession, tmp_path: Path): + """ + STAGES the static CSV file to a temporary DBFS location + to bypass the WorkspaceLocalFileSystem security restriction. + """ + # 1. Define Paths + csv_filename = "org_data.csv" + fixture_dir = "tests/unit-tests/utils/fixtures" + + # Path to the file on the local driver disk (where it currently lives) + local_base_path = Path(os.getcwd()) / fixture_dir + local_file_path = local_base_path / csv_filename + + # Define the TEMPORARY path on DBFS + # We use tmp_path name to ensure uniqueness, but put it on DBFS + dbfs_staging_path = f"dbfs:/tmp/pytest_data/{tmp_path.name}/" + dbfs_file_path = dbfs_staging_path + csv_filename + + # 2. Check and Write the CSV content locally (if you haven't done it yet) + csv_content = """OrganisationID,Name,Status +ORG001,Test Hospital,Active +ORG002,Test Clinic,Inactive +ORG003,Test Surgery,Active""" + + local_base_path.mkdir(parents=True, exist_ok=True) + local_file_path.write_text(csv_content) # Writes file to local driver disk + + # 3. 
CRITICAL STEP: COPY the file from Local Disk to DBFS + try: + # Access dbutils via SparkSession for Pytest compatibility + dbutils = spark.dbutils + # Source path MUST start with 'file:' prefix for dbutils.fs.cp to find the local file + dbutils.fs.cp(f"file://{str(local_file_path)}", dbfs_file_path, recurse=True) + except Exception as e: + # Cleanup DBFS folder even if copy failed + try: + dbutils.fs.rm(dbfs_staging_path, recurse=True) + except: + pass + raise RuntimeError(f"Failed to stage file to DBFS. Error: {e}") + + # 4. Return the DBFS path (No 'file://' needed, as it is now cloud storage) + # This base_path is now the DBFS staging folder path + return dbfs_staging_path, csv_filename + +# @pytest.fixture +# def sample_csv_data(spark: SparkSession, tmp_path: Path): +# """ +# Uses Spark to write mock CSV data to a temporary, local directory. +# This guarantees Spark can read it back during the test. +# """ +# # 1. Define the unique temporary folder path +# temp_folder_path = tmp_path / "test_csv_input" + +# # 2. Define the mock data +# data = [ +# ("ORG001", "Test Hospital", "Active"), +# ("ORG002", "Test Clinic", "Inactive"), +# ("ORG003", "Test Surgery", "Active") +# ] +# columns = ["OrganisationID", "Name", "Status"] + +# # 3. Write the DataFrame to the temporary folder using Spark +# spark.createDataFrame(data, columns).write.csv( +# path=str(temp_folder_path), +# header=True, +# mode="overwrite" +# ) + +# # 4. Return the path components +# # We return the folder path as base_path, and we can keep the original filename +# # for the test signature, even though the function ignores it now. +# return str(temp_folder_path) + "/", "test_data.csv" + +# @pytest.fixture +# def sample_csv_data(tmp_path): +# """Create a sample CSV file for testing""" +# # Create a temporary CSV file +# csv_file = tmp_path / "test_data.csv" +# csv_content = """OrganisationID,Name,Status +# ORG001,Test Hospital,Active +# ORG002,Test Clinic,Inactive +# ORG003,Test Surgery,Active""" + +# csv_file.write_text(csv_content) + +# return str(tmp_path) + "/", "test_data.csv" + + + + +def test_load_csv_table_basic(spark: SparkSession, sample_csv_data): + + + """Test that load_csv_table loads CSV correctly""" + base_path, csv_filename = sample_csv_data + + # Load the CSV + # temp solution + + df = load_csv_table(spark, base_path, csv_filename) + + # Verify the DataFrame was created + assert df is not None + + # Verify row count + assert df.count() == 3 + + # Verify columns exist + columns = df.columns + assert "OrganisationID" in columns + assert "Name" in columns + assert "Status" in columns + +# def get_spark() -> SparkSession: +# try: +# from databricks.connect import DatabricksSession +# return DatabricksSession.builder.serverless(True).getOrCreate() +# except ImportError: +# return SparkSession.builder.getOrCreate() + +# def get_spark() -> SparkSession: +# """Get Spark session - use local for testing""" +# return ( +# SparkSession.builder +# .master("local[*]") +# .appName("unit-tests") +# .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse") +# .getOrCreate() +# ) +# def get_spark() -> SparkSession: +# """Get Spark session - use notebook's existing session""" +# return SparkSession.getActiveSession() or SparkSession.builder.getOrCreate() + +# def test_load_csv_table_data_content(spark, sample_csv_data): +# """Test that load_csv_table loads correct data""" +# base_path, csv_filename = sample_csv_data + +# # Load the CSV +# df = load_csv_table(base_path, csv_filename) + +# # Collect data and verify content +# 
rows = df.collect() + +# # Check first row +# assert rows[0]["OrganisationID"] == "ORG001" +# assert rows[0]["Name"] == "Test Hospital" +# assert rows[0]["Status"] == "Active" + +# # Check second row +# assert rows[1]["OrganisationID"] == "ORG002" +# assert rows[1]["Status"] == "Inactive" + + +# def test_load_csv_table_schema_inference(spark, sample_csv_data): +# """Test that schema inference works correctly""" +# base_path, csv_filename = sample_csv_data + +# # Load the CSV +# df = load_csv_table(base_path, csv_filename) + +# # Verify schema was inferred +# schema = df.schema +# assert len(schema.fields) == 3 + +# # All fields should be strings in this case +# for field in schema.fields: +# assert field.dataType == StringType() + + +# def test_load_csv_table_with_numeric_data(spark, tmp_path): +# """Test CSV loading with numeric columns""" +# # Create CSV with numeric data +# csv_file = tmp_path / "numeric_data.csv" +# csv_content = """ID,Count,Value +# 1,100,25.5 +# 2,200,30.75 +# 3,150,22.25""" + +# csv_file.write_text(csv_content) + +# # Load the CSV +# df = load_csv_table(str(tmp_path) + "/", "numeric_data.csv") + +# # Verify numeric types were inferred +# schema = df.schema +# id_field = [f for f in schema.fields if f.name == "ID"][0] +# count_field = [f for f in schema.fields if f.name == "Count"][0] +# value_field = [f for f in schema.fields if f.name == "Value"][0] + +# # Check that numeric columns were inferred as integers or doubles +# assert id_field.dataType == IntegerType() +# assert count_field.dataType == IntegerType() +# # Value field should be double since it has decimals +# assert "Double" in str(value_field.dataType) + + +# def test_load_csv_table_empty_file(spark, tmp_path): +# """Test handling of CSV with only headers""" +# csv_file = tmp_path / "empty_data.csv" +# csv_content = """OrganisationID,Name,Status""" + +# csv_file.write_text(csv_content) + +# # Load the CSV +# df = load_csv_table(str(tmp_path) + "/", "empty_data.csv") + +# # Should have columns but no rows +# assert len(df.columns) == 3 +# assert df.count() == 0 + + +# def test_load_csv_table_file_not_found(spark, tmp_path): +# """Test error handling when file doesn't exist""" +# with pytest.raises(Exception): +# load_csv_table(str(tmp_path) + "/", "nonexistent_file.csv") \ No newline at end of file From d24658b8bd40a805e1e4902765a5b9f4a2daab81 Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Tue, 23 Dec 2025 15:21:27 +0000 Subject: [PATCH 13/14] Added unit tests for a date transformation --- ...st.ipynb => notebook_better_secrets.ipynb} | 3 +- notebooks/run_working_days_example.ipynb | 75 ++++ .../utils/run_tests.py-worked} | 0 .../utils/test_loader_5.py-worked} | 0 notebooks/unit-tests/ReadMe.md | 2 - ...ntitled Notebook 2025-12-16 16_15_04.ipynb | 99 ------ .../utils/test_loader_5.py-dbutils-fail | 115 ------- pyproject.toml | 25 +- resources/pipeline/ods_ingestion.yml | 15 +- src/ingestion/ods_ingest.py | 98 +----- src/transformations/__init__.py | 2 + src/transformations/date_transforms.py | 69 ++++ tests/.gitinclude | 0 tests/README.md | 3 + tests/Run_Tests.ipynb | 215 ++++++++++++ tests/__init__.pynotapackagepath | 0 tests/conftest.py | 10 + tests/integration-tests/README.md | 43 +++ .../integration-tests/bronze/.gitkeep | 0 .../integration-tests/gold/(Clone) .gitkeep | 0 .../ingestion/(Clone) .gitkeep | 0 .../integration-tests/silver/(Clone) .gitkeep | 0 .../transformations/(Clone) .gitkeep | 0 tests/integration-tests/utils/test_loaders.py | 148 ++++++++ tests/unit-tests/__init__.pynope | 0 
.../test_date_transformations.py | 228 +++++++++++++ .../utils/test_loaders2 fail.py-fail | 99 ------ tests/unit-tests/utils/test_loaders_4.py | 319 ------------------ 28 files changed, 834 insertions(+), 734 deletions(-) rename notebooks/{unit-tests/utils/notebook_integration_ingestion_test.ipynb => notebook_better_secrets.ipynb} (98%) create mode 100644 notebooks/run_working_days_example.ipynb rename notebooks/{unit-tests/utils/run_tests.py => unit-tests-del/utils/run_tests.py-worked} (100%) rename notebooks/{unit-tests/utils/test_loader_5.py => unit-tests-del/utils/test_loader_5.py-worked} (100%) delete mode 100644 notebooks/unit-tests/ReadMe.md delete mode 100644 notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb delete mode 100644 notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail create mode 100644 src/transformations/__init__.py create mode 100644 src/transformations/date_transforms.py delete mode 100644 tests/.gitinclude create mode 100644 tests/README.md create mode 100644 tests/Run_Tests.ipynb delete mode 100644 tests/__init__.pynotapackagepath create mode 100644 tests/conftest.py create mode 100644 tests/integration-tests/README.md rename notebooks/.gitinclude => tests/integration-tests/bronze/.gitkeep (100%) rename notebooks/__init__.py-nameclashdel => tests/integration-tests/gold/(Clone) .gitkeep (100%) rename notebooks/unit-tests/__init__.py-notimportingfrom => tests/integration-tests/ingestion/(Clone) .gitkeep (100%) rename notebooks/unit-tests/utils/__init__.pynotimportingfrom => tests/integration-tests/silver/(Clone) .gitkeep (100%) rename src/transformations/.gitinclude => tests/integration-tests/transformations/(Clone) .gitkeep (100%) create mode 100644 tests/integration-tests/utils/test_loaders.py delete mode 100644 tests/unit-tests/__init__.pynope create mode 100644 tests/unit-tests/transformations/test_date_transformations.py delete mode 100644 tests/unit-tests/utils/test_loaders2 fail.py-fail delete mode 100644 tests/unit-tests/utils/test_loaders_4.py diff --git a/notebooks/unit-tests/utils/notebook_integration_ingestion_test.ipynb b/notebooks/notebook_better_secrets.ipynb similarity index 98% rename from notebooks/unit-tests/utils/notebook_integration_ingestion_test.ipynb rename to notebooks/notebook_better_secrets.ipynb index 4fb4e24..757e515 100644 --- a/notebooks/unit-tests/utils/notebook_integration_ingestion_test.ipynb +++ b/notebooks/notebook_better_secrets.ipynb @@ -43,6 +43,7 @@ "source": [ "import sys, os\n", "# we need better routing\n", + "# toml providing it - only for test and once loaded will work until session refresh\n", "PROJECT_ROOT = \"/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src\"\n", "sys.path.insert(0, PROJECT_ROOT)\n", "\n", @@ -271,7 +272,7 @@ "notebookMetadata": { "pythonIndentUnit": 4 }, - "notebookName": "notebook_integration_ingestion_test", + "notebookName": "notebook_better_secrets", "widgets": {} }, "language_info": { diff --git a/notebooks/run_working_days_example.ipynb b/notebooks/run_working_days_example.ipynb new file mode 100644 index 0000000..9c2422e --- /dev/null +++ b/notebooks/run_working_days_example.ipynb @@ -0,0 +1,75 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "39ad4a5b-480a-43a9-a61b-22286ded55a5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + 
"source": [ + "# This file is just to see another databricks projects function being used because i dont have the access to the global file the function has an added bit for making the file on the fly\n", + "\n", + "\n", + "# The only \"setup\" line you should need\n", + "import sys, os\n", + "# we need better routing\n", + "# toml providing it - only for test and once loaded will work until session refresh\n", + "PROJECT_ROOT = \"/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src\"\n", + "sys.path.insert(0, PROJECT_ROOT)\n", + "\n", + "from transformations import working_days_monthly\n", + "\n", + "# COMMAND ----------\n", + "# 2. Setup some dummy data to see if it works\n", + "data = [(\"2023-01-01\",), (\"2023-02-01\",)]\n", + "columns = [\"start_date\"]\n", + "sample_df = spark.createDataFrame(data, columns)\n", + "col_start_date = \"start_date\"\n", + "\n", + "\n", + "# COMMAND ----------\n", + "# 4. Run the function\n", + "# Notice we pass 'spark' (the global session) into the function\n", + "output_df = working_days_monthly(spark, sample_df, col_start_date)\n", + "\n", + "# COMMAND ----------\n", + "# 5. View the result\n", + "display(output_df)\n", + "\n" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "run_working_days_example", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/unit-tests/utils/run_tests.py b/notebooks/unit-tests-del/utils/run_tests.py-worked similarity index 100% rename from notebooks/unit-tests/utils/run_tests.py rename to notebooks/unit-tests-del/utils/run_tests.py-worked diff --git a/notebooks/unit-tests/utils/test_loader_5.py b/notebooks/unit-tests-del/utils/test_loader_5.py-worked similarity index 100% rename from notebooks/unit-tests/utils/test_loader_5.py rename to notebooks/unit-tests-del/utils/test_loader_5.py-worked diff --git a/notebooks/unit-tests/ReadMe.md b/notebooks/unit-tests/ReadMe.md deleted file mode 100644 index 3541e50..0000000 --- a/notebooks/unit-tests/ReadMe.md +++ /dev/null @@ -1,2 +0,0 @@ -# Thoughts -I am unsure if we would have layer based folders to mirror the unit tests here. it may depend on if we would just run the lot. Or if in fact we would use notebooks but would use some kind of built in test runner. 
diff --git a/notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb b/notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb deleted file mode 100644 index 163d1a1..0000000 --- a/notebooks/unit-tests/utils/Untitled Notebook 2025-12-16 16_15_04.ipynb +++ /dev/null @@ -1,99 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "317bf231-1e75-47ac-989b-49a2299cf416", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "# Trying out cells\n", - "\n", - "Not running pyspark successfully here\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "7bdae43d-b7da-419f-acdc-16606c7a5823", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "import pytest\n", - "from pyspark.sql import SparkSession" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "cdc41037-b32e-418d-9a15-99cf81241569", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "**Extracting Data**" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "4e4d6d9a-2d09-4710-8774-c2bc4d9717e9", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "computePreferences": null, - "dashboards": [], - "environmentMetadata": { - "base_environment": "", - "environment_version": "4" - }, - "inputWidgetPreferences": null, - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "Untitled Notebook 2025-12-16 16_15_04", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail b/notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail deleted file mode 100644 index a91ce5f..0000000 --- a/notebooks/unit-tests/utils/test_loader_5.py-dbutils-fail +++ /dev/null @@ -1,115 +0,0 @@ -"""Pytest unit tests for data loading utilities""" - -import pytest -from pyspark.sql import SparkSession -import sys -import random - -# Routing -PROJECT_ROOT = "/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src" -sys.path.insert(0, PROJECT_ROOT) - -from utils import load_csv_table - - -@pytest.fixture(scope="session") -def spark(): - """Get the active Spark session (Databricks)""" - return SparkSession.getActiveSession() - - -@pytest.fixture -def mock_csv_data(spark): - """ - Create test CSV data in DBFS temp location. - Automatically cleans up after the test. 
- """ - # Create test data - data = [ - ("ORG001", "Test Hospital", "Active"), - ("ORG002", "Test Clinic", "Inactive"), - ("ORG003", "Test Surgery", "Active") - ] - columns = ["OrganisationID", "Name", "Status"] - - test_df = spark.createDataFrame(data, columns) - - # Use DBFS temp location with unique ID - test_id = random.randint(10000, 99999) - temp_path = f"/tmp/test_csv_{test_id}" - - # Write test data - test_df.coalesce(1).write.csv( - temp_path, - header=True, - mode="overwrite" - ) - - # Find and rename the CSV file - # qqqq butils in pytest or .py will not work because not a notebook - files = dbutils.fs.ls(temp_path) - csv_file = [f for f in files if f.name.startswith("part-") and f.name.endswith(".csv")][0] - new_path = f"{temp_path}/Contact_Details.csv" - dbutils.fs.mv(csv_file.path, new_path) - - # Yield the paths to the test - yield temp_path + "/", "Contact_Details.csv" - - # Cleanup after test - try: - dbutils.fs.rm(temp_path, recurse=True) - except Exception as e: - print(f"Warning: Failed to clean up {temp_path}: {e}") - - -def test_load_csv_table_basic(spark, mock_csv_data): - """Test that load_csv_table loads CSV correctly""" - base_path, csv_filename = mock_csv_data - - # Load the CSV - df = load_csv_table(spark, base_path, csv_filename) - - # Verify DataFrame was created - assert df is not None, "DataFrame should not be None" - - # Verify row count - assert df.count() == 3, f"Expected 3 rows, got {df.count()}" - - # Verify columns exist - columns = df.columns - assert "OrganisationID" in columns, "Missing OrganisationID column" - assert "Name" in columns, "Missing Name column" - assert "Status" in columns, "Missing Status column" - - # Verify data content - rows = df.collect() - assert rows[0]["OrganisationID"] == "ORG001", "First row OrganisationID incorrect" - assert rows[0]["Name"] == "Test Hospital", "First row Name incorrect" - assert rows[0]["Status"] == "Active", "First row Status incorrect" - - -def test_load_csv_table_column_names(spark, mock_csv_data): - """Test that column names are preserved correctly""" - base_path, csv_filename = mock_csv_data - - df = load_csv_table(spark, base_path, csv_filename) - - expected_columns = ["OrganisationID", "Name", "Status"] - assert df.columns == expected_columns, f"Columns don't match. 
Got: {df.columns}" - - -def test_load_csv_table_all_rows_loaded(spark, mock_csv_data): - """Test that all rows are loaded without data loss""" - base_path, csv_filename = mock_csv_data - - df = load_csv_table(spark, base_path, csv_filename) - - # Check all expected organisations are present - org_ids = [row["OrganisationID"] for row in df.collect()] - assert "ORG001" in org_ids - assert "ORG002" in org_ids - assert "ORG003" in org_ids - - -# 🚀 RUN THE TESTS NOW -pytest.main([__file__, "-v", "-s", "--tb=short"]) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2ee3f70..5a900ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,8 @@ dependencies = [ [dependency-groups] dev = [ - # "pytest", + "pytest", + "pytest-mock", # Very useful for mocking dbutils later "databricks-dlt", "databricks-connect>=15.4,<15.5", ] @@ -34,13 +35,15 @@ packages = ["src"] [tool.black] line-length = 125 -# will tis clash with conftest -#[tool.pytest.ini_options] -#minversion = "6.0" -#testpaths = ["tests/unit-tests"] -#pythonpath = ["src"] -#python_files = ["test_*.py"] -#addopts = [ -# "-v", -# "--tb=short" -#] +[tool.pytest.ini_options] +minversion = "6.0" +# This tells pytest to look in 'tests' by default +testpaths = ["tests"] +# This is the "Magic Fix" for routing. It adds 'src' to the path automatically! +pythonpath = ["src"] +python_files = ["test_*.py"] +addopts = [ + "-v", + "-s", + "--tb=short" +] diff --git a/resources/pipeline/ods_ingestion.yml b/resources/pipeline/ods_ingestion.yml index 8a6a1ea..8a2ffb1 100644 --- a/resources/pipeline/ods_ingestion.yml +++ b/resources/pipeline/ods_ingestion.yml @@ -42,6 +42,9 @@ variables: layer: default: bronze description: bronze, silver, transfrormations etc + domain: + default: ods + description: pipeline per domain I expect x-bronze-config: &bronze-config @@ -56,6 +59,14 @@ resources: pipeline_ods_ingestion: name: ods_ingestion libraries: + ####### it seems without a wheel we cannot provide the other files + # This is your DLT entry point + #- file: + # path: ../../src/ingestion/ods_ingest.py + + # This makes 'import utils' possible. 
+ # - glob: + # include: ../../src/utils/ - glob: # if doing a pipeline per layer would do something like # include: ../../src/ingestion/ - might work @@ -68,7 +79,7 @@ resources: channel: current # By defining catalog here we set it for all jobs in the pipeline without needing to specify it witht he variable when defining a table catalog: ${var.catalog} - target: ${var.schema_prefix}${var.layer}_ods ## AI said missing this qqqq i dont want this hard coded here + target: ${var.schema_prefix}${var.layer}_${var.domain} serverless: true # qqqq dont think i need this here DELETE root_path: ../../src/ingestion # qqqq config is only at pipeline level use yml anchor points if need to reuse @@ -85,4 +96,4 @@ resources: # bundle.layer_name: bronze -> #schema_layer = "bronze_" -> # schema_layer = park.conf.get("bundle.layer_name") # configuration: <<: *bronze-config #config anchor point for bronze layer so all pipelines in this file will have this set of configs - pipeline.domain: ods # if we then want to apply per pipeline variable here \ No newline at end of file + pipeline.domain: ${var.domain} # if we then want to apply per pipeline variable here \ No newline at end of file diff --git a/src/ingestion/ods_ingest.py b/src/ingestion/ods_ingest.py index c695702..cda8339 100644 --- a/src/ingestion/ods_ingest.py +++ b/src/ingestion/ods_ingest.py @@ -1,59 +1,21 @@ -# for making spark tables -# qqqq problem i am having is that we are setting the schema, and dev has schema set as user names -# i want to use the databricks.yml schema name for dev, and for staging and prod i want to set it to bronze_ods in this script - -# This does work, but dont know how the file is working generally when bundle files are not generated anymore, so its not working from within the bundle. -# ultimately probably still want to be in a wheel will see what happens with unit testing it +# worked but improving pathing +# unless we set up a wheel artifact this is required to access utils import sys import os -# ai says This is an acceptable workaround for running code directly in Databricks pipelines/jobs when you haven't built a Python wheel. sys.path.append(os.path.abspath('..')) -### TRY ### -# import sys -# bundle_src_path = sys.argv[1] -# sys.path.append(bundle_src_path) from pyspark import pipelines as dp -#from utils.loaders import load_csv_table #use wheel instead from utils.loaders import load_csv_table -# Fixed System Constants these and some of this stuff should be going in a helper i think -#ADLS_PROTOCOL = "abfss://" -#ADLS_SUFFIX = ".dfs.core.windows.net" - -# 1. Get the Catalog name -# qqqq i dont think i want a default id prefer an error i think -# if set this in pipeline yml we wont need it -#catalog_name = spark.conf.get("bundle.catalog") - -# 2. Get the Schema Prefix (This is what changes between environments) -# In Dev, this will be the username. In Staging/Prod, it will be blank. -#schema_user_prefix = spark.conf.get("bundle.schema_prefix") -# this will often be a medallion layer but in src we also have transformations and ingestion so i think this mirror folders in source would be logical if team agrees qqqq -#schema_layer = "bronze_" -#schema_domain = "ods" #qqqq check what terminiology we want here - -# Construct the final schema name -#schema_name = (schema_user_prefix + schema_layer + schema_domain) -#print(schema_name) -# The container likely should mirror the layer name? -# container_layer ?? 
qqqq -#container = spark.conf.get("bundle.layer") # layer is bronze silver etc -# This likely should be dev staging prod -# storage_environment ?? qqqq -# wouldnt have default # storage_account = spark.conf.get("bundle.storage_account") # 'unifiedrptdeltalake' storage_container_path = spark.conf.get("pipeline.storage_container_path") -# In our storage our folders maybe should be domain based and if we thing this is manageable as hard rule this variable could be called domain_folder or similar qqqq -# domain_folder ?? qqqq folder_name = spark.conf.get("pipeline.domain") # ods -#folder_location_path = f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/{folder_name}/" folder_location_path = f"{storage_container_path}/{folder_name}/" -# "abfss://bronze@unifiedrptdeltalake.dfs.core.windows.net/ods + print(folder_location_path) -# qqqq this could be far simpler hardcode "import raw" the just list the names but we want it a bit more flexible + # List of table names TABLE_NAMES = [ @@ -82,7 +44,7 @@ } for table_name in TABLE_NAMES } - +# ALternatively if need to set the values specifically # ODS_TABLES = { # "Additional_Attributes_Details": { # "csv_filename": "Additional_Attributes_Details.csv", @@ -91,50 +53,9 @@ # "Code_System_Details": { # "csv_filename": "Code_System_Details.csv", # "comment": "Import raw Code_System_Details" -# }, -# "Contact_Details": { -# "csv_filename": "Contact_Details.csv", -# "comment": "Import raw Contact_Details" -# }, -# "Manifest_Details": { -# "csv_filename": "Manifest_Details.csv", -# "comment": "Import raw Manifest_Details" -# }, -# "Organisation_Details": { -# "csv_filename": "Organisation_Details.csv", -# "comment": "Import raw Organisation_Details" -# }, -# "OtherID_Details": { -# "csv_filename": "OtherID_Details.csv", -# "comment": "Import raw OtherID_Details" -# }, -# "PrimaryRole_Details": { -# "csv_filename": "PrimaryRole_Details.csv", -# "comment": "Import raw PrimaryRole_Details" -# }, -# "Relationship_Details": { -# "csv_filename": "Relationship_Details.csv", -# "comment": "Import raw Relationship_Details" -# }, -# "Role_Details": { -# "csv_filename": "Role_Details.csv", -# "comment": "Import raw Role_Details" -# }, -# "Successor_Details": { -# "csv_filename": "Successor_Details.csv", -# "comment": "Import raw Successor_Details" -# }, -# } +# } .... 
+# -## get from wheel -# def load_csv_table(base_path, csv_filename): -# """Load CSV from Azure storage with standard options""" -# return ( -# spark.read.format("csv") -# .option("header", "true") -# .option("inferSchema", "true") -# .load(f"{base_path}{csv_filename}") -# ) # Create DLT tables dynamically for table_name, config in ODS_TABLES.items(): @@ -149,6 +70,11 @@ def table_loader(): create_table() +####################################################################### +### DEL just keeping because poc +########################################################## + + # def load_csv_table(table_name): # """Load CSV from Azure storage with standard options""" # return ( diff --git a/src/transformations/__init__.py b/src/transformations/__init__.py new file mode 100644 index 0000000..cb3d90c --- /dev/null +++ b/src/transformations/__init__.py @@ -0,0 +1,2 @@ +from .date_transforms import working_days_monthly +__all__ = ["working_days_monthly"] \ No newline at end of file diff --git a/src/transformations/date_transforms.py b/src/transformations/date_transforms.py new file mode 100644 index 0000000..df566e7 --- /dev/null +++ b/src/transformations/date_transforms.py @@ -0,0 +1,69 @@ +####### THIS CODE IS GENUINCE FROM ANOTHER TEAM IVE JUST MADE SPARK DEPENDENCY INJECTED ######## +import os +from pyspark.sql import functions as fn # Added + +## def working_days_monthly(df, col_start_date): # Original +def working_days_monthly(spark, df, col_start_date): + """ + Description: Adds a column to monthly level data with working days, based on the first day of the month. + + Parameters: + df: the dataframe to perform the transformation on + col_start_date: the column that contains the first date of the month in your dataframe + + Returns: + output: the initial dataframe with the new column added with working days for the month + """ + + ## Added because i dont have the global file + calendar_full = "/tmp/standard_calendar.parquet" + + # 1. RELIABLE CHECK: Does the file exist? + file_exists = False + try: + dbutils.fs.ls(calendar_full) + file_exists = True + except: + file_exists = False + + # 2. GENERATE if missing + if not file_exists: + print(f"Creating missing calendar at {calendar_full}...") + spark.range(0, 365 * 10).select( + fn.expr("date_add('2020-01-01', cast(id as int))").alias("Date") + ).withColumn("Month_Start", fn.trunc("Date", "MM")) \ + .withColumn("Working_Day_Type", fn.when(fn.date_format("Date", "EEEE").isin("Saturday", "Sunday"), "N").otherwise("Y")) \ + .withColumn("Working_Day_Calc", fn.when(fn.col("Working_Day_Type") == "Y", 1).otherwise(0)) \ + .write.mode("overwrite").parquet(calendar_full) + + # 3. 
READ (This will now definitely work) + cal_df = spark.read.parquet(calendar_full) # This is a global file i dont have + + cal_df = cal_df.filter(fn.col("Working_Day_Type") == "Y").dropDuplicates() + + cal_df = cal_df.withColumn( + "Month_Start", + fn.date_format(fn.to_date(fn.col("Month_Start"), "yyyy-MM-dd"), "yyyy-MM-dd").cast("date") + ) + + # Aggregate by Month_Start and sum Working_Day_Calc + cal_df = cal_df.groupBy( + "Month_Start" + ).agg( + fn.sum("Working_Day_Calc").alias("Total_Working_Days") + ) + + # Convert start date in df to date format for comparison + df = df.withColumn(col_start_date, fn.to_date(fn.col(col_start_date).cast("date"))) + + # Create a new column with the total working days + output = df.join( + cal_df.select("Month_Start", "Total_Working_Days"), + fn.col(col_start_date) == fn.col("Month_Start"), + "left" + ).withColumn( + "working_days", + fn.coalesce(fn.col("Total_Working_Days"), fn.lit(0)) + ).drop("Month_Start", "Total_Working_Days") + + return output diff --git a/tests/.gitinclude b/tests/.gitinclude deleted file mode 100644 index e69de29..0000000 diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..8b496d9 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,3 @@ +Tests live here +Run_Tests allows manual running for devs +Routes handled by conftest and toml diff --git a/tests/Run_Tests.ipynb b/tests/Run_Tests.ipynb new file mode 100644 index 0000000..b02ab2b --- /dev/null +++ b/tests/Run_Tests.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "60b0c695-bb33-4088-9cb9-a9a451ebc89b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Test Runner Notebook\n", + "\n", + "Tests can be triggered from here.\n", + "GitActions in future would want to on deploying dabs trigger integration tests and within gitactions run unit tests as part of pull request process." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8e7bb6bc-d6d2-4466-8db1-970dad0f3d30", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Test Env Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ff38cce8-1271-46d6-87f9-5c170b38fe45", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# MAGIC %pip install pytest\n", + "# because doesnt get pytest from toml???\n", + "import pytest\n", + "import sys\n", + "#import os\n", + "\n", + "# This looks 'up' one level from this notebook to find your project root\n", + "# using toml now\n", + "# repo_root = os.path.abspath('..') \n", + "# sys.path.append(f\"{repo_root}/src\")\n", + "\n", + "# Prevent __pycache__ creation\n", + "sys.dont_write_bytecode = True\n", + "\n", + "# print(f\"Project root set to: {repo_root}\")\n", + "print(\"Setup run\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c0b2b59b-7a20-43ac-becf-b1be7f487d68", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Unit Test Runner" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ae7b9eca-1b77-4b3f-bfb8-27abef298460", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "\n", + "# Run your test file\n", + "# This will run every file starting with 'test_' in that folder\n", + "pytest.main([\n", + " \"unit-tests\",\n", + " \"-v\", # verbose\n", + " \"-s\", # show print output\n", + " \"--tb=short\" # short tracebacks\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f5a4ed24-9c54-47e0-96ca-d6b5ae80a1b1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Integration Test Runner" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "818af375-e198-4799-b494-15d4f273f802", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Run your test file\n", + "# This will run every file starting with 'test_' in that folder\n", + "pytest.main([\n", + " \"integration-tests\",\n", + " \"-v\", # verbose\n", + " \"-s\", # show print output\n", + " \"--tb=short\" # short tracebacks\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb174373-6843-4d70-bc34-d95d47db7c2d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [] + } + ], + 
"metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "Run_Tests", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tests/__init__.pynotapackagepath b/tests/__init__.pynotapackagepath deleted file mode 100644 index e69de29..0000000 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..a3f9687 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +import pytest +from pyspark.sql import SparkSession + +# Use existing Spark session in Databricks (Spark Connect) +@pytest.fixture(scope="session") +def spark(): + session = SparkSession.getActiveSession() + if session is None: + raise RuntimeError("No active Spark session found. Ensure you are running in Databricks.") + return session \ No newline at end of file diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md new file mode 100644 index 0000000..27d48d6 --- /dev/null +++ b/tests/integration-tests/README.md @@ -0,0 +1,43 @@ +# To use this folder +- need proper pathing +- need toml and test config set up + +# Integration Tests + +These tests are not purely functional they interact with the databricks environment. +They will need dbutils spark and the file system. +We should write code to keep these concerns seperate where we can to enable clean testing + +Integration tests will be triggered by git action or can be triggered via a notebook. +They will run in databricks environment where as unit tests will be purely functional so can be run +via a notebook in databrick or as a gitaction within github. 
+ +# Notes to help future implementation + +## A databricks job to be triggered on deploying a dab i expect +Just pseudo code +``` +bundle: + name: my-feature-tests + +resources: + jobs: + test_job: + name: "PyTest Runner" + tasks: + - task_key: run_pytest + new_cluster: + spark_version: "13.3.x-scala2.12" + node_type_id: "Standard_DS3_v2" + num_workers: 1 + notebook_task: + notebook_path: ./somepath/run_integration_tests.py + +``` +Something to add to the run tests py +``` +# Very important: Exit with the code from pytest +# so the GitHub Action knows if it failed or passed +if retcode != 0: + sys.exit(retcode) +``` \ No newline at end of file diff --git a/notebooks/.gitinclude b/tests/integration-tests/bronze/.gitkeep similarity index 100% rename from notebooks/.gitinclude rename to tests/integration-tests/bronze/.gitkeep diff --git a/notebooks/__init__.py-nameclashdel b/tests/integration-tests/gold/(Clone) .gitkeep similarity index 100% rename from notebooks/__init__.py-nameclashdel rename to tests/integration-tests/gold/(Clone) .gitkeep diff --git a/notebooks/unit-tests/__init__.py-notimportingfrom b/tests/integration-tests/ingestion/(Clone) .gitkeep similarity index 100% rename from notebooks/unit-tests/__init__.py-notimportingfrom rename to tests/integration-tests/ingestion/(Clone) .gitkeep diff --git a/notebooks/unit-tests/utils/__init__.pynotimportingfrom b/tests/integration-tests/silver/(Clone) .gitkeep similarity index 100% rename from notebooks/unit-tests/utils/__init__.pynotimportingfrom rename to tests/integration-tests/silver/(Clone) .gitkeep diff --git a/src/transformations/.gitinclude b/tests/integration-tests/transformations/(Clone) .gitkeep similarity index 100% rename from src/transformations/.gitinclude rename to tests/integration-tests/transformations/(Clone) .gitkeep diff --git a/tests/integration-tests/utils/test_loaders.py b/tests/integration-tests/utils/test_loaders.py new file mode 100644 index 0000000..a29277d --- /dev/null +++ b/tests/integration-tests/utils/test_loaders.py @@ -0,0 +1,148 @@ +import random +import pytest +from pathlib import Path +from pyspark.sql import SparkSession +import random +from pyspark.dbutils import DBUtils + + +############################### +## This is more integration now anyway i need a core function test for unit +## Spark and dbutils are not going to work in git +## Routing wont work in git actions +## Need to move things into testing config +## need to move thing into toml +## file needs to be in correct location +############################## + + +from utils import load_csv_table + + +###################################### +### PyTest doesnt have access to spark and dbutils in the same way so we are checking the test setup here ### +###################################### + +@pytest.fixture(scope="function") +def test_dbfs_setup(spark): + """ + Fixture to handle the setup and teardown of DBFS test data. + This keeps the actual test function clean. 
+ """ + dbutils = DBUtils(spark) + test_id = random.randint(10000, 99999) + dbfs_folder = f"/tmp/test_csv_{test_id}" + filename = "Contact_Details.csv" + + # --- SETUP: Create a real CSV file --- + test_data = [ + ("ORG001", "Test Hospital", "Active"), + ("ORG002", "Test Clinic", "Inactive"), + ("ORG003", "Test Surgery", "Active") + ] + columns = ["OrganisationID", "Name", "Status"] + test_df = spark.createDataFrame(test_data, columns) + + # Write to temp then move to make it a 'proper' single CSV file + temp_path = f"{dbfs_folder}/temp" + test_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(temp_path) + + part_file = [f.path for f in dbutils.fs.ls(temp_path) if f.path.endswith('.csv')][0] + final_path = f"{dbfs_folder}/{filename}" + dbutils.fs.cp(part_file, final_path) + + # Provide the path to the test + yield f"dbfs:{dbfs_folder}/", filename + + # --- TEARDOWN: Cleanup after test --- + dbutils.fs.rm(dbfs_folder, recurse=True) + +def test_load_csv_table_using_dbfs(spark, test_dbfs_setup): + """ + Test loading CSV from DBFS using the logic from src/utils. + """ + # Unpack the fixture values + base_path, filename = test_dbfs_setup + + # ACT: Run the actual function from your src/utils + df = load_csv_table(spark, base_path, filename) + + # ASSERT + assert df.count() == 3 + assert "OrganisationID" in df.columns + + first_row = df.collect()[0] + assert first_row["OrganisationID"] == "ORG001" + print(f"✅ Successfully verified {filename} at {base_path}") + +################### DEL below just in because poc ########################## + +# def test_load_csv_table_using_dbfs(spark): +# """ +# Test loading CSV from DBFS - THIS SHOULD WORK +# """ +# dbutils = DBUtils(spark) + +# # Create test data +# test_data = [ +# ("ORG001", "Test Hospital", "Active"), +# ("ORG002", "Test Clinic", "Inactive"), +# ("ORG003", "Test Surgery", "Active") +# ] +# columns = ["OrganisationID", "Name", "Status"] +# test_df = spark.createDataFrame(test_data, columns) + +# # Write to DBFS +# test_id = random.randint(10000, 99999) +# dbfs_path = f"/tmp/test_dbfs_{test_id}" +# filename = "Contact_Details.csv" + +# # Write using Spark (creates directory with part files) +# test_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(dbfs_path) + +# print(f"✓ Wrote to DBFS: {dbfs_path}") + +# # Setup paths +# test_id = random.randint(10000, 99999) +# dbfs_folder = f"/tmp/test_dbfs_{test_id}" +# filename = "Contact_Details.csv" + +# # STEP 1: Write to temporary location (Spark creates part files) +# temp_write_path = f"{dbfs_folder}/temp_write" +# test_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(temp_write_path) + +# print(f"✓ Wrote to temp location: {temp_write_path}") + +# # STEP 2: Find the actual CSV part file that was created +# files = dbutils.fs.ls(temp_write_path) +# csv_part_file = [f.path for f in files if f.path.endswith('.csv')][0] + +# print(f"✓ Found part file: {csv_part_file}") + +# # STEP 3: Copy it to a proper filename +# final_csv_path = f"{dbfs_folder}/{filename}" +# dbutils.fs.cp(csv_part_file, final_csv_path) + +# print(f"✓ Copied to proper filename: {final_csv_path}") + +# # Test our function with DBFS path +# base_path = f"dbfs:{dbfs_folder}/" +# df = load_csv_table(spark, base_path, filename) +# # worked but its using partials files in a folder directory - i want to test with an actual full file +# # df = load_csv_table_copy(spark, base_path, "") # Empty filename since Spark reads the whole directory + +# print(f"✓ Loaded using: 
load_csv_table(spark, '{base_path}', '{filename}')") +# df.show() + +# # Assertions +# assert df.count() == 3, f"Expected 3 rows, got {df.count()}" +# assert "OrganisationID" in df.columns +# assert "Name" in df.columns +# assert "Status" in df.columns + +# first_row = df.collect()[0] +# assert first_row["OrganisationID"] == "ORG001" + +# # Cleanup +# dbutils.fs.rm(dbfs_path, recurse=True) +# print("✅ DBFS test PASSED!") \ No newline at end of file diff --git a/tests/unit-tests/__init__.pynope b/tests/unit-tests/__init__.pynope deleted file mode 100644 index e69de29..0000000 diff --git a/tests/unit-tests/transformations/test_date_transformations.py b/tests/unit-tests/transformations/test_date_transformations.py new file mode 100644 index 0000000..7b14627 --- /dev/null +++ b/tests/unit-tests/transformations/test_date_transformations.py @@ -0,0 +1,228 @@ +# test_working_days_monthly.py +import sys +import pytest +from pyspark.sql import SparkSession +from pyspark.sql import functions as fn + +# Project imports +# should be provided by toml +# PROJECT_ROOT = "/Workspace/Users/philip.tate@nhs.net/PT Separate Feature Branch/src" +# sys.path.insert(0, PROJECT_ROOT) + +from transformations import working_days_monthly + + +# ============================================================================ +# FIXTURES +# ============================================================================ +# should be automatic +# @pytest.fixture(scope="session") +# def spark(): +# """Provide Spark session for all tests""" +# session = SparkSession.getActiveSession() +# if session is None: +# raise RuntimeError("No active Spark session found. Running in Databricks?") +# return session + + +@pytest.fixture(scope="session") +def spark(): + """Provide Spark session for all tests""" + session = SparkSession.getActiveSession() + if session is None: + raise RuntimeError("No active Spark session found. 
Running in Databricks?") + return session + + +@pytest.fixture(scope="function") +def sample_dataframe_january_2023(spark): + """ + Fixture providing a simple test dataframe for January 2023 + """ + data = [("2023-01-01",)] + columns = ["start_date"] + return spark.createDataFrame(data, columns) + + +@pytest.fixture(scope="function") +def sample_dataframe_multiple_months(spark): + """ + Fixture providing a dataframe with multiple months + """ + data = [ + ("2023-01-01",), + ("2023-02-01",), + ("2023-03-01",), + ] + columns = ["start_date"] + return spark.createDataFrame(data, columns) + + +@pytest.fixture(scope="function") +def sample_dataframe_with_extra_columns(spark): + """ + Fixture providing a dataframe with additional columns beyond start_date + """ + data = [ + ("2023-01-01", "ORG001", 100), + ("2023-02-01", "ORG002", 200), + ] + columns = ["start_date", "org_id", "value"] + return spark.createDataFrame(data, columns) + + +# ============================================================================ +# TESTS +# ============================================================================ + +def test_working_days_adds_column(spark, sample_dataframe_january_2023): + """ + Test that working_days_monthly adds the working_days column + """ + # ACT + result_df = working_days_monthly(spark, sample_dataframe_january_2023, "start_date") + + # ASSERT + assert "working_days" in result_df.columns, "working_days column should be added" + assert result_df.count() == 1, f"Expected 1 row, got {result_df.count()}" + + print("✅ Column added successfully") + + +def test_working_days_january_2023_count(spark, sample_dataframe_january_2023): + """ + Test that January 2023 returns 22 working days + January 2023: 31 days - 4 Saturdays - 5 Sundays = 22 working days + """ + # ACT + result_df = working_days_monthly(spark, sample_dataframe_january_2023, "start_date") + + # ASSERT + working_days = result_df.collect()[0]["working_days"] + assert working_days == 22, f"January 2023 should have 22 working days, got {working_days}" + + print(f"✅ January 2023 verified: {working_days} working days") + + +def test_working_days_february_2023_count(spark): + """ + Test that February 2023 returns 20 working days + February 2023: 28 days - 4 Saturdays - 4 Sundays = 20 working days + """ + # ARRANGE + data = [("2023-02-01",)] + columns = ["start_date"] + input_df = spark.createDataFrame(data, columns) + + # ACT + result_df = working_days_monthly(spark, input_df, "start_date") + + # ASSERT + working_days = result_df.collect()[0]["working_days"] + assert working_days == 20, f"February 2023 should have 20 working days, got {working_days}" + + print(f"✅ February 2023 verified: {working_days} working days") + + +def test_working_days_multiple_months(spark, sample_dataframe_multiple_months): + """ + Test that function handles multiple months correctly + """ + # ACT + result_df = working_days_monthly(spark, sample_dataframe_multiple_months, "start_date") + + # ASSERT + assert result_df.count() == 3, "Should have 3 rows" + + # All rows should have positive working days + results = result_df.collect() + for row in results: + assert row["working_days"] > 0, f"All rows should have positive working days" + assert row["working_days"] <= 23, f"Working days should be reasonable" + + print("✅ Multiple months handled correctly") + + +def test_working_days_preserves_columns(spark, sample_dataframe_with_extra_columns): + """ + Test that function preserves existing columns + """ + # ACT + result_df = working_days_monthly(spark, 
sample_dataframe_with_extra_columns, "start_date") + + # ASSERT + assert "start_date" in result_df.columns, "Original start_date column should be preserved" + assert "org_id" in result_df.columns, "org_id column should be preserved" + assert "value" in result_df.columns, "value column should be preserved" + assert "working_days" in result_df.columns, "working_days column should be added" + + # Check data integrity + first_row = result_df.filter(fn.col("org_id") == "ORG001").collect()[0] + assert first_row["value"] == 100, "Original data should be preserved" + assert first_row["working_days"] > 0, "Working days should be calculated" + + print("✅ All columns preserved correctly") + + +def test_working_days_with_null_dates(spark): + """ + Test handling of null dates (edge case) + """ + # ARRANGE + data = [ + ("2023-01-01",), + (None,), + ("2023-02-01",), + ] + columns = ["start_date"] + input_df = spark.createDataFrame(data, columns) + + # ACT + result_df = working_days_monthly(spark, input_df, "start_date") + + # ASSERT + assert result_df.count() == 3, "Should handle null dates without dropping rows" + + # Null date should have 0 working days (due to coalesce in function) + null_row = result_df.filter(fn.col("start_date").isNull()).collect() + if null_row: + assert null_row[0]["working_days"] == 0, "Null dates should have 0 working days" + + print("✅ Null dates handled correctly") + + +def test_working_days_different_column_name(spark): + """ + Test with a different column name for start date + """ + # ARRANGE + data = [("2023-01-01",), ("2023-02-01",)] + columns = ["month_beginning"] # Different column name + input_df = spark.createDataFrame(data, columns) + + # ACT + result_df = working_days_monthly(spark, input_df, "month_beginning") + + # ASSERT + assert "working_days" in result_df.columns + assert result_df.count() == 2 + + print("✅ Different column name handled correctly") + + +def test_working_days_values_are_reasonable(spark, sample_dataframe_multiple_months): + """ + Test that all working days values are within reasonable bounds + No month should have more than 23 working days or less than 19 + """ + # ACT + result_df = working_days_monthly(spark, sample_dataframe_multiple_months, "start_date") + + # ASSERT + results = result_df.collect() + for row in results: + working_days = row["working_days"] + assert 19 <= working_days <= 23, \ + f"Working days should be between 19-23, got {working_days} for {row['start_date']}" + + print("✅ All working days values are reasonable") \ No newline at end of file diff --git a/tests/unit-tests/utils/test_loaders2 fail.py-fail b/tests/unit-tests/utils/test_loaders2 fail.py-fail deleted file mode 100644 index 3dd40d9..0000000 --- a/tests/unit-tests/utils/test_loaders2 fail.py-fail +++ /dev/null @@ -1,99 +0,0 @@ -import pytest -from pyspark.sql import SparkSession -from pathlib import Path -import os -import sys - -# --- HELPER FUNCTION TO GET DBUTILS (Required fix) --- -def get_dbutils(spark: SparkSession): - """ - Safely retrieves the dbutils object, necessary when spark.dbutils is not available - due to Spark Connect or other environment settings. 
- """ - try: - # Tries the standard Databricks notebook method - from pyspark.sql import SQLContext # Need this import for the notebook method - return SQLContext(spark.sparkContext).createDataFrame([('1', '2')], ['col1', 'col2']).sqlContext.sparkSession._jvm.com.databricks.dbutils.DBUtils.getDBUtils(spark.sparkContext) - except Exception: - # Fallback for alternative environments - try: - from IPython import get_ipython - return get_ipython().user_ns["dbutils"] - except: - # Final fallback, though unlikely to work if the above failed - raise Exception("Cannot find DBUtils object.") - -# --- Your Unmodified Production Function (Included for context) --- -def load_csv_table(spark, base_path, csv_filename): - return ( - spark.read.format("csv") - .option("header", "true") - .option("inferSchema", "true") - .load(f"{base_path}{csv_filename}") - ) - -# --- THE WORKING FIXTURE (Using the safe DBUtils getter) --- - -@pytest.fixture -def sample_csv_data(spark: SparkSession, tmp_path: Path): - """ - STAGES the static CSV file to a temporary DBFS location - to bypass the WorkspaceLocalFileSystem security restriction. - """ - # 1. Define Paths - csv_filename = "org_data.csv" - fixture_dir = "tests/unit-tests/utils/fixtures" - - local_base_path = Path(os.getcwd()) / fixture_dir - local_file_path = local_base_path / csv_filename - - # Define the TEMPORARY path on DBFS - # We use tmp_path name to ensure uniqueness - dbfs_staging_path = f"dbfs:/tmp/pytest_data/{tmp_path.name}/" - - # 2. Write the CSV content locally (if file doesn't exist) - csv_content = """OrganisationID,Name,Status -ORG001,Test Hospital,Active -ORG002,Test Clinic,Inactive -ORG003,Test Surgery,Active""" - - local_base_path.mkdir(parents=True, exist_ok=True) - local_file_path.write_text(csv_content) - - # 3. CRITICAL STEP: COPY the file from Local Disk to DBFS - try: - # FIX: Get dbutils using the helper function - dbutils = get_dbutils(spark) - - # Source path MUST start with 'file:' prefix for dbutils.fs.cp - local_source = f"file://{str(local_file_path)}" - - # Destination path is the folder on DBFS - dbutils.fs.cp(local_source, dbfs_staging_path, recurse=True) - - except Exception as e: - # This will catch the get_dbutils failure or the cp failure - raise RuntimeError(f"Failed to stage file to DBFS. Error: {e}") - - # 4. Return the DBFS path (It's now a folder containing the CSV) - # The load function will look for base_path/csv_filename which will fail if cp was recursive. - # We need the base_path to be the DBFS folder path. - return dbfs_staging_path, csv_filename - -# --- The Test Function (Unchanged, assuming you fixed the name) --- -def test_load_csv_table_basic(spark: SparkSession, sample_csv_data): - """ - This test loads the data from the temporary DBFS staging location. - """ - base_path, csv_filename = sample_csv_data - df = load_csv_table(spark, base_path, csv_filename) - - # Assertions... - assert df.count() == 3 - - # Cleanup the staged data (Recommended) - try: - get_dbutils(spark).fs.rm(base_path, recurse=True) - except Exception as e: - # Don't fail the test on cleanup failure - print(f"Warning: Failed to clean up DBFS path {base_path}. 
Error: {e}") \ No newline at end of file diff --git a/tests/unit-tests/utils/test_loaders_4.py b/tests/unit-tests/utils/test_loaders_4.py deleted file mode 100644 index a081739..0000000 --- a/tests/unit-tests/utils/test_loaders_4.py +++ /dev/null @@ -1,319 +0,0 @@ -"""Unit tests for data loading utilities""" - -# []() - -# qqqq TODO this is very AI generated run it past someone make it more relevant for data team -# need to review some examples and refactor todo qqqq -import pytest -import tempfile -import os -from pyspark.sql import SparkSession -from pyspark.conf import SparkConf -#from pyspark.sql.types import StringType, IntegerType -from src.utils.loaders import load_csv_table -from pathlib import Path -# Helper function for dbutils access (used for cleanup) -# def get_dbutils(spark: SparkSession): -# """Safely get dbutils for cleaning up files on DBFS.""" -# try: -# # Databricks Connect (the likely scenario for your tests) -# return spark.sparkContext._jvm.com.databricks.dbutils_v1.DBUtilsHolder.dbutils -# except Exception: -# # Fallback (for completeness, though Connect should be the priority) -# return None - -# @pytest.fixture -# def sample_csv_data(spark: SparkSession, tmp_path: Path): -# """ -# 1. Creates CSV data locally (using pytest's tmp_path). -# 2. Stages the data onto DBFS using the 'file://' protocol. -# 3. Cleans up the DBFS files afterward using dbutils. -# """ -# csv_filename = "test_data.csv" -# csv_file_local = tmp_path / csv_filename - -# # 1. Create CSV locally -# csv_content = """OrganisationID,Name,Status -# ORG001,Test Hospital,Active -# ORG002,Test Clinic,Inactive -# ORG003,Test Surgery,Active""" -# csv_file_local.write_text(csv_content) - -# # 2. Define the unique DBFS staging directory -# dbfs_dir_path = f"dbfs:/tmp/{tmp_path.name}/" - -# # --- Data Staging (The fix is here) --- -# print(f"\n[INFO] Staging data from LOCAL: file://{str(csv_file_local)} to REMOTE: {dbfs_dir_path}") - -# try: -# spark.read.format("csv") \ -# .option("header", "true") \ -# .option("inferSchema", "true") \ -# .load(f"file://{str(csv_file_local)}") \ -# .write.format("csv") \ -# .option("header", "true") \ -# .mode("overwrite") \ -# .save(dbfs_dir_path) -# except Exception as e: -# print(f"[ERROR] Spark write failed during staging: {e}") -# raise # Re-raise the exception to fail the test setup - -# # 3. Return the path to the test function (without the 'dbfs:' prefix) -# dbfs_base_path_for_load = f"/tmp/{tmp_path.name}/" - -# # --- YIELD (Test Execution) and CLEANUP (Teardown) --- -# try: -# yield dbfs_base_path_for_load, csv_filename -# finally: -# # 4. Clean up the files on DBFS -# print(f"\n[INFO] Starting DBFS cleanup for: {dbfs_dir_path}") -# try: -# dbutils = get_dbutils(spark) -# if dbutils: -# dbutils.fs.rm(dbfs_dir_path, True) -# print(f"[INFO] DBFS cleanup complete.") -# else: -# print("[WARN] Could not retrieve dbutils. Skipping DBFS cleanup.") -# except Exception as e: -# print(f"[WARN] Failed to clean up DBFS path {dbfs_dir_path}. Manual cleanup may be required. Error: {e}") - -# finds it but no permission -# @pytest.fixture -# def sample_csv_data(): -# """ -# Returns the paths needed to read the static CSV file located in the repo -# fixtures folder, ensuring the 'file://' prefix is used. - -# NOTE: Assumes you have created the file at: -# 'tests/unit-tests/utils/fixtures/org_data.csv' -# """ -# # --- Configuration (Adjust relative path if needed) --- -# csv_filename = "org_data.csv" -# fixture_dir = "tests/unit-tests/utils/fixtures" - -# # 1. 
Calculate the absolute path on the driver's local disk -# # This finds the file's true physical location -# absolute_base_path = Path(os.getcwd()) / fixture_dir - -# # 2. CRITICAL FIX: Add 'file://' prefix to override the DBFS assumption -# # Path must end with a slash for your function's concatenation to work. -# base_path_with_protocol = f"file://{str(absolute_base_path)}/" - -# return base_path_with_protocol, csv_filename -@pytest.fixture -def sample_csv_data(spark: SparkSession, tmp_path: Path): - """ - STAGES the static CSV file to a temporary DBFS location - to bypass the WorkspaceLocalFileSystem security restriction. - """ - # 1. Define Paths - csv_filename = "org_data.csv" - fixture_dir = "tests/unit-tests/utils/fixtures" - - # Path to the file on the local driver disk (where it currently lives) - local_base_path = Path(os.getcwd()) / fixture_dir - local_file_path = local_base_path / csv_filename - - # Define the TEMPORARY path on DBFS - # We use tmp_path name to ensure uniqueness, but put it on DBFS - dbfs_staging_path = f"dbfs:/tmp/pytest_data/{tmp_path.name}/" - dbfs_file_path = dbfs_staging_path + csv_filename - - # 2. Check and Write the CSV content locally (if you haven't done it yet) - csv_content = """OrganisationID,Name,Status -ORG001,Test Hospital,Active -ORG002,Test Clinic,Inactive -ORG003,Test Surgery,Active""" - - local_base_path.mkdir(parents=True, exist_ok=True) - local_file_path.write_text(csv_content) # Writes file to local driver disk - - # 3. CRITICAL STEP: COPY the file from Local Disk to DBFS - try: - # Access dbutils via SparkSession for Pytest compatibility - dbutils = spark.dbutils - # Source path MUST start with 'file:' prefix for dbutils.fs.cp to find the local file - dbutils.fs.cp(f"file://{str(local_file_path)}", dbfs_file_path, recurse=True) - except Exception as e: - # Cleanup DBFS folder even if copy failed - try: - dbutils.fs.rm(dbfs_staging_path, recurse=True) - except: - pass - raise RuntimeError(f"Failed to stage file to DBFS. Error: {e}") - - # 4. Return the DBFS path (No 'file://' needed, as it is now cloud storage) - # This base_path is now the DBFS staging folder path - return dbfs_staging_path, csv_filename - -# @pytest.fixture -# def sample_csv_data(spark: SparkSession, tmp_path: Path): -# """ -# Uses Spark to write mock CSV data to a temporary, local directory. -# This guarantees Spark can read it back during the test. -# """ -# # 1. Define the unique temporary folder path -# temp_folder_path = tmp_path / "test_csv_input" - -# # 2. Define the mock data -# data = [ -# ("ORG001", "Test Hospital", "Active"), -# ("ORG002", "Test Clinic", "Inactive"), -# ("ORG003", "Test Surgery", "Active") -# ] -# columns = ["OrganisationID", "Name", "Status"] - -# # 3. Write the DataFrame to the temporary folder using Spark -# spark.createDataFrame(data, columns).write.csv( -# path=str(temp_folder_path), -# header=True, -# mode="overwrite" -# ) - -# # 4. Return the path components -# # We return the folder path as base_path, and we can keep the original filename -# # for the test signature, even though the function ignores it now. 
-# return str(temp_folder_path) + "/", "test_data.csv" - -# @pytest.fixture -# def sample_csv_data(tmp_path): -# """Create a sample CSV file for testing""" -# # Create a temporary CSV file -# csv_file = tmp_path / "test_data.csv" -# csv_content = """OrganisationID,Name,Status -# ORG001,Test Hospital,Active -# ORG002,Test Clinic,Inactive -# ORG003,Test Surgery,Active""" - -# csv_file.write_text(csv_content) - -# return str(tmp_path) + "/", "test_data.csv" - - - - -def test_load_csv_table_basic(spark: SparkSession, sample_csv_data): - - - """Test that load_csv_table loads CSV correctly""" - base_path, csv_filename = sample_csv_data - - # Load the CSV - # temp solution - - df = load_csv_table(spark, base_path, csv_filename) - - # Verify the DataFrame was created - assert df is not None - - # Verify row count - assert df.count() == 3 - - # Verify columns exist - columns = df.columns - assert "OrganisationID" in columns - assert "Name" in columns - assert "Status" in columns - -# def get_spark() -> SparkSession: -# try: -# from databricks.connect import DatabricksSession -# return DatabricksSession.builder.serverless(True).getOrCreate() -# except ImportError: -# return SparkSession.builder.getOrCreate() - -# def get_spark() -> SparkSession: -# """Get Spark session - use local for testing""" -# return ( -# SparkSession.builder -# .master("local[*]") -# .appName("unit-tests") -# .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse") -# .getOrCreate() -# ) -# def get_spark() -> SparkSession: -# """Get Spark session - use notebook's existing session""" -# return SparkSession.getActiveSession() or SparkSession.builder.getOrCreate() - -# def test_load_csv_table_data_content(spark, sample_csv_data): -# """Test that load_csv_table loads correct data""" -# base_path, csv_filename = sample_csv_data - -# # Load the CSV -# df = load_csv_table(base_path, csv_filename) - -# # Collect data and verify content -# rows = df.collect() - -# # Check first row -# assert rows[0]["OrganisationID"] == "ORG001" -# assert rows[0]["Name"] == "Test Hospital" -# assert rows[0]["Status"] == "Active" - -# # Check second row -# assert rows[1]["OrganisationID"] == "ORG002" -# assert rows[1]["Status"] == "Inactive" - - -# def test_load_csv_table_schema_inference(spark, sample_csv_data): -# """Test that schema inference works correctly""" -# base_path, csv_filename = sample_csv_data - -# # Load the CSV -# df = load_csv_table(base_path, csv_filename) - -# # Verify schema was inferred -# schema = df.schema -# assert len(schema.fields) == 3 - -# # All fields should be strings in this case -# for field in schema.fields: -# assert field.dataType == StringType() - - -# def test_load_csv_table_with_numeric_data(spark, tmp_path): -# """Test CSV loading with numeric columns""" -# # Create CSV with numeric data -# csv_file = tmp_path / "numeric_data.csv" -# csv_content = """ID,Count,Value -# 1,100,25.5 -# 2,200,30.75 -# 3,150,22.25""" - -# csv_file.write_text(csv_content) - -# # Load the CSV -# df = load_csv_table(str(tmp_path) + "/", "numeric_data.csv") - -# # Verify numeric types were inferred -# schema = df.schema -# id_field = [f for f in schema.fields if f.name == "ID"][0] -# count_field = [f for f in schema.fields if f.name == "Count"][0] -# value_field = [f for f in schema.fields if f.name == "Value"][0] - -# # Check that numeric columns were inferred as integers or doubles -# assert id_field.dataType == IntegerType() -# assert count_field.dataType == IntegerType() -# # Value field should be double since it has decimals -# 
assert "Double" in str(value_field.dataType) - - -# def test_load_csv_table_empty_file(spark, tmp_path): -# """Test handling of CSV with only headers""" -# csv_file = tmp_path / "empty_data.csv" -# csv_content = """OrganisationID,Name,Status""" - -# csv_file.write_text(csv_content) - -# # Load the CSV -# df = load_csv_table(str(tmp_path) + "/", "empty_data.csv") - -# # Should have columns but no rows -# assert len(df.columns) == 3 -# assert df.count() == 0 - - -# def test_load_csv_table_file_not_found(spark, tmp_path): -# """Test error handling when file doesn't exist""" -# with pytest.raises(Exception): -# load_csv_table(str(tmp_path) + "/", "nonexistent_file.csv") \ No newline at end of file From 03209c27791478d2c3775e80333c1e6546e4947e Mon Sep 17 00:00:00 2001 From: Phil-NHS Date: Tue, 23 Dec 2025 16:15:15 +0000 Subject: [PATCH 14/14] try a manual action --- .github/workflows/manual-trigger-test-poc.yml | 29 ++++++++++++++++ pyproject.toml | 3 ++ requirements-dev.txt | 1 + tests/conftest.py | 34 ++++++++++++++++--- .../test_date_transformations.py | 3 +- 5 files changed, 65 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/manual-trigger-test-poc.yml create mode 100644 requirements-dev.txt diff --git a/.github/workflows/manual-trigger-test-poc.yml b/.github/workflows/manual-trigger-test-poc.yml new file mode 100644 index 0000000..b02d834 --- /dev/null +++ b/.github/workflows/manual-trigger-test-poc.yml @@ -0,0 +1,29 @@ +name: Manual test run (PoC) + +on: + workflow_dispatch: + +jobs: + pytest: + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Upgrade pip + run: python -m pip install --upgrade pip + + - name: Install project + test deps + run: | + pip install -r requirements-dev.txt + pip install -e . + + - name: Run pytest (exclude Databricks tests) + run: | + pytest -m "not databricks" -v \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 5a900ed..ad75182 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,3 +47,6 @@ addopts = [ "-s", "--tb=short" ] +markers = [ + "databricks: for example we might use this marker to exclude a specific unit test from git env test running so it only run in dbx env", +] \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..55b033e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pytest \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index a3f9687..8da9d81 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,36 @@ import pytest from pyspark.sql import SparkSession -# Use existing Spark session in Databricks (Spark Connect) + @pytest.fixture(scope="session") def spark(): + # Databricks / Spark Connect session = SparkSession.getActiveSession() - if session is None: - raise RuntimeError("No active Spark session found. 
Ensure you are running in Databricks.") - return session \ No newline at end of file + if session is not None: + yield session + return + + # Local / CI Spark + spark = ( + SparkSession.builder + .master("local[1]") + .appName("pytest-pyspark") + .config("spark.ui.enabled", "false") + .getOrCreate() + ) + + yield spark + spark.stop() + + +### Worked but want something that can work with cicd +# import pytest +# from pyspark.sql import SparkSession + +# # Use existing Spark session in Databricks (Spark Connect) +# @pytest.fixture(scope="session") +# def spark(): +# session = SparkSession.getActiveSession() +# if session is None: +# raise RuntimeError("No active Spark session found. Ensure you are running in Databricks.") +# return session \ No newline at end of file diff --git a/tests/unit-tests/transformations/test_date_transformations.py b/tests/unit-tests/transformations/test_date_transformations.py index 7b14627..9888c50 100644 --- a/tests/unit-tests/transformations/test_date_transformations.py +++ b/tests/unit-tests/transformations/test_date_transformations.py @@ -103,7 +103,8 @@ def test_working_days_january_2023_count(spark, sample_dataframe_january_2023): print(f"✅ January 2023 verified: {working_days} working days") - +# Just trying out pytest.mark so can exclude by run: pytest -m "not databricks" but the intention would be unit tests via github action and integration by github action triggering test in databricks environment +@pytest.mark.databricks def test_working_days_february_2023_count(spark): """ Test that February 2023 returns 20 working days