diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 63b7205..fba5fa1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,56 +1,56 @@ -# We want to run CI processes that can run independent of databricks as branch rules so that we dont # deploy at cost code that we already should know needs changing -# such as linting, and unit test for python, maybe dab? verify -# we run these on all pull request because if there is a hot fix it may not have passed through -# staging for example -# qqqq check this is up to date -name: CI - Pull Request Checks - -# Run CI on all pull requests -on: - pull_request: - branches: - - '**' # all branches - -jobs: - ci_checks: - name: "Linting, Unit Tests, DAB Verify" - runs-on: ubuntu-latest - - steps: - # Checkout code - - name: Checkout repository - uses: actions/checkout@v4 - - # Set up Python - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - # Install dependencies used for linting and unit tests - - name: Install dependencies - run: pip install -r requirements-dev.txt - - # Run python unit tests - - name: Run Unit Tests - run: pytest tests/unit - - # Run python lint - # qqqq on example used flake8 instead - # pyproject.toml will need configuring - - name: Run Linting - run: pylint src - - # qqqq to do run commit lint step and put in commit lint config - # see TELBlazor - - name: Commit lint - run: | - echo "Commit lint not implemented" - exit 1 - - # qqqq to do run version generation step and put in commit lint config - # see TELBlazor - - name: Version Generation Test Run - run: | - echo "Version test run not implemented" - exit 1 +# We want to run CI processes that can run independent of databricks as branch rules so that we dont # deploy at cost code that we already should know needs changing +# such as linting, and unit test for python, maybe dab? 
verify +# we run these on all pull requests because if there is a hot fix it may not have passed through +# staging for example +# qqqq check this is up to date +name: CI - Pull Request Checks + +# Run CI on all pull requests +on: + pull_request: + branches: + - '**' # all branches + +jobs: + ci_checks: + name: "Linting, Unit Tests, DAB Verify" + runs-on: ubuntu-latest + + steps: + # Checkout code + - name: Checkout repository + uses: actions/checkout@v4 + + # Set up Python + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + # Install dependencies used for linting and unit tests + - name: Install dependencies + run: pip install -r requirements-dev.txt + + # Run python unit tests + - name: Run Unit Tests + run: pytest tests/unit + + # Run python lint + # qqqq on example used flake8 instead + # pyproject.toml will need configuring + - name: Run Linting + run: pylint src + + # qqqq to do run commit lint step and put in commit lint config + # see TELBlazor + - name: Commit lint + run: | + echo "Commit lint not implemented" + exit 1 + + # qqqq to do run version generation step and put in commit lint config + # see TELBlazor + - name: Version Generation Test Run + run: | + echo "Version test run not implemented" + exit 1 diff --git a/devops/README.md b/devops/README.md new file mode 100644 index 0000000..b4ee10e --- /dev/null +++ b/devops/README.md @@ -0,0 +1,5 @@ +# Development Deployment + +It would be nice to be able to trigger unit tests, bundle validation, and bundle deployment for the local development user areas without the terminal and without needing to push to GitHub. + +This doesn't seem doable with a notebook. Enabling the terminal is an option, but for now the process will be to deploy through the databricks.yml UI and to remember to trigger any unit tests.
\ No newline at end of file diff --git a/resources/pipeline/ods_ingestion.yml b/resources/pipeline/ods_ingestion.yml index 2685775..7c91c29 100644 --- a/resources/pipeline/ods_ingestion.yml +++ b/resources/pipeline/ods_ingestion.yml @@ -1,86 +1,86 @@ -############################### -## POC notes - DELETE LATER -############################### -## We should think about these resource files I think potentially a .yml per layer bronze.yml may make sense -## We will not define schemas here -## We use this file to expose from databricks.yml the variables we need to set up the pipeline -## We will define too variables just for the set of pipelines here too if we start running layer based .ymls then we would have layer level variables here -############################### -## If we want specific pipeline resource file per .py file we should use this i think - # libraries: - # - notebook: - # path: ../../src/ingestion/ods_ingest.py -## if we want per layer maybe - # libraries: - # - glob: - # # if doing a pipeline per layer would do something like - # include: ../../src/ingestion/**.py -## if we want per domain maybe - # libraries: - # - glob: - # # if doing a pipeline per layer would do something like - # include: ../../src/ingestion/ods_*.py -############################### - -# qqqq discus where want these things to live if it was using a wheel then the python file could be literally a table and a foreach -##### -# If we are running multlipe pipelines we may define all their vars at the top -##### - - -# qqqq -## im thinking var for in script var <-- also no because i cand get bundle.xyz and no all vars seem accessible everywhere i get catalog from databricks.yml -## bundle for vars originating from databricks.ymly -### i get vars from databricks -## pipeline. 
from pipeline files -## but files run, it shouldnt be bundle and pipeline it should represent the scope they come from - -## qqqq i like the top level config value to pass i do not like construction vars in a yml instead of python but -# Error: cannot create pipeline: The target schema field is required for UC pipelines. Reason: DLT requires specifying a target schema for UC pipelines. Please use the TEMPORARY keyword in the CREATE MATERIALIZED VIEW or CREATE STREAMING TABLE statement if you do not wish to publish your dataset.. -# Error: cannot update pipeline: Specified 'schema' field in the pipeline settings is illegal. Reason: Cannot unset 'schema' field once it's defined in the pipeline spec. Please create a new DLT pipeline. For more information about publishing modes, see https://docs.databricks.com/en/dlt/migrate-to-dpm.html. -variables: - layer: - default: bronze - description: bronze, silver, transfrormations etc - - -x-bronze-config: &bronze-config - bundle.env_name: ${var.env_name} - bundle.storage_account: ${var.storage_account} #storage is environment specific so defined in databricks.yml - pipeline.layer: ${var.layer} # if we are doing layer based resource files qqqq get from var - # f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/ -> py adds {folder_name}/" - pipeline.storage_container_path: "abfss://${var.layer}@${var.storage_account}.dfs.core.windows.net/" - -resources: - pipelines: - pipeline_ods_ingestion: - name: ods_ingestion - libraries: - - glob: - # if doing a pipeline per layer would do something like - # include: ../../src/ingestion/ - might work - # include: ../../src/ingestion/*.py - doesnt work - include: ../../src/ingestion/ods_ingest.py - photon: true - # qqqq good practice to specify its something to do with dlt having beta version? 
- channel: current - # By defining catalog here we set it for all jobs in the pipeline without needing to specify it witht he variable when defining a table - catalog: ${var.catalog} - target: ${var.schema_prefix}${var.layer}_ods ## AI said missing this qqqq i dont want this hard coded here - serverless: true - # qqqq dont think i need this here DELETE root_path: ../../src/ingestion - # qqqq config is only at pipeline level use yml anchor points if need to reuse - configuration: - ################ Map Databricks Bundle variables to Spark Config Properties ################ - # Map the Bundle variables (from databricks.yml) to Spark config properties - # The key names here MUST match what you use in spark.conf.get() in Python! - # bundle.env_name: ${var.env_name} - # bundle.schema_prefix: ${var.schema_prefix} - qqqq setting schema now in the yml - # bundle.storage_account: ${var.storage_account} - ############### Resource yml files for set of pipelines ################# - # If we do bronze, silver ... 
tranformation based layers with own yml files will define layer level vars here - # for example this would be - # bundle.layer_name: bronze -> #schema_layer = "bronze_" -> # schema_layer = park.conf.get("bundle.layer_name") - # configuration: - <<: *bronze-config #config anchor point for bronze layer so all pipelines in this file will have this set of configs +############################### +## POC notes - DELETE LATER +############################### +## We should think about these resource files I think potentially a .yml per layer bronze.yml may make sense +## We will not define schemas here +## We use this file to expose from databricks.yml the variables we need to set up the pipeline +## We will define too variables just for the set of pipelines here too if we start running layer based .ymls then we would have layer level variables here +############################### +## If we want specific pipeline resource file per .py file we should use this i think + # libraries: + # - notebook: + # path: ../../src/ingestion/ods_ingest.py +## if we want per layer maybe + # libraries: + # - glob: + # # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/**.py +## if we want per domain maybe + # libraries: + # - glob: + # # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/ods_*.py +############################### + +# qqqq discus where want these things to live if it was using a wheel then the python file could be literally a table and a foreach +##### +# If we are running multlipe pipelines we may define all their vars at the top +##### + + +# qqqq +## im thinking var for in script var <-- also no because i cand get bundle.xyz and no all vars seem accessible everywhere i get catalog from databricks.yml +## bundle for vars originating from databricks.ymly +### i get vars from databricks +## pipeline. 
from pipeline files +## but files run, it shouldnt be bundle and pipeline it should represent the scope they come from + +## qqqq i like the top level config value to pass i do not like construction vars in a yml instead of python but +# Error: cannot create pipeline: The target schema field is required for UC pipelines. Reason: DLT requires specifying a target schema for UC pipelines. Please use the TEMPORARY keyword in the CREATE MATERIALIZED VIEW or CREATE STREAMING TABLE statement if you do not wish to publish your dataset.. +# Error: cannot update pipeline: Specified 'schema' field in the pipeline settings is illegal. Reason: Cannot unset 'schema' field once it's defined in the pipeline spec. Please create a new DLT pipeline. For more information about publishing modes, see https://docs.databricks.com/en/dlt/migrate-to-dpm.html. +variables: + layer: + default: bronze + description: bronze, silver, transfrormations etc + + +x-bronze-config: &bronze-config + bundle.env_name: ${var.env_name} + bundle.storage_account: ${var.storage_account} #storage is environment specific so defined in databricks.yml + pipeline.layer: ${var.layer} # if we are doing layer based resource files qqqq get from var + # f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/ -> py adds {folder_name}/" + pipeline.storage_container_path: "abfss://${var.layer}@${var.storage_account}.dfs.core.windows.net/" + +resources: + pipelines: + pipeline_ods_ingestion: + name: ods_ingestion + libraries: + - glob: + # if doing a pipeline per layer would do something like + # include: ../../src/ingestion/ - might work + # include: ../../src/ingestion/*.py - doesnt work + include: ../../src/ingestion/ods_ingest.py + photon: true + # qqqq good practice to specify its something to do with dlt having beta version? 
+ channel: current + # By defining catalog here we set it for all jobs in the pipeline without needing to specify it with the variable when defining a table + catalog: ${var.catalog} + target: ${var.schema_prefix}${var.layer}_ods ## AI said missing this qqqq i dont want this hard coded here + serverless: true + # qqqq don't think i need this here DELETE root_path: ../../src/ingestion + # qqqq config is only at pipeline level use yml anchor points if need to reuse + configuration: + ################ Map Databricks Bundle variables to Spark Config Properties ################ + # Map the Bundle variables (from databricks.yml) to Spark config properties + # The key names here MUST match what you use in spark.conf.get() in Python! + # bundle.env_name: ${var.env_name} + # bundle.schema_prefix: ${var.schema_prefix} - qqqq setting schema now in the yml + # bundle.storage_account: ${var.storage_account} + ############### Resource yml files for set of pipelines ################# + # If we do bronze, silver ... transformation-based layers with own yml files will define layer level vars here + # for example this would be + # bundle.layer_name: bronze -> #schema_layer = "bronze_" -> # schema_layer = spark.conf.get("bundle.layer_name") + # configuration: + <<: *bronze-config #config anchor point for bronze layer so all pipelines in this file will have this set of configs pipeline.domain: ods # if we then want to apply per pipeline variable here \ No newline at end of file