Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download wheel dependency locally to register it to the dependency graph #1704

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,11 @@ def _register_library(self, graph: DependencyGraph, library: compute.Library) ->
local_file.write_bytes(remote_file.read())
yield from graph.register_library(local_file.as_posix())
if library.whl:
# TODO: download the wheel somewhere local and add it to "virtual sys.path" via graph.path_lookup.push_path
# TODO: https://github.com/databrickslabs/ucx/issues/1640
yield DependencyProblem("not-yet-implemented", "Wheel library is not yet implemented")
with self._ws.workspace.download(library.whl, format=ExportFormat.AUTO) as remote_file:
with tempfile.TemporaryDirectory() as directory:
local_file = Path(directory) / Path(library.whl).name
local_file.write_bytes(remote_file.read())
yield from graph.register_library(local_file.as_posix())
if library.requirements: # https://pip.pypa.io/en/stable/reference/requirements-file-format/
logger.info(f"Registering libraries from {library.requirements}")
with self._ws.workspace.download(library.requirements, format=ExportFormat.AUTO) as remote_file:
Expand Down
65 changes: 62 additions & 3 deletions tests/integration/source_code/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
from datetime import timedelta
from io import StringIO
from pathlib import Path
from unittest.mock import create_autospec

from databricks.labs.blueprint.tui import Prompts
from databricks.sdk.errors import NotFound
from databricks.sdk.retries import retried
from databricks.sdk.service.compute import Library, PythonPyPiLibrary
from databricks.sdk.service.workspace import ImportFormat

from databricks.labs.blueprint.tui import Prompts

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.known import UNKNOWN, Whitelist
from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.path_lookup import PathLookup
Expand All @@ -21,7 +22,12 @@


@retried(on=[NotFound], timeout=timedelta(minutes=2))
def test_running_real_workflow_linter_job(installation_ctx):
def test_running_real_workflow_linter_job(installation_ctx, make_notebook, make_directory, make_job):
# Deprecated file system path in call to: /mnt/things/e/f/g
lint_problem = b"display(spark.read.csv('/mnt/things/e/f/g'))"
notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=lint_problem)
make_job(notebook_path=notebook)

ctx = installation_ctx
ctx.workspace_installation.run()
ctx.deployed_workflows.run_workflow("experimental-workflow-linter")
Expand Down Expand Up @@ -239,3 +245,56 @@ def test_workflow_linter_lints_job_with_egg_dependency(
problems = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id)

assert len([problem for problem in problems if problem.message == expected_problem_message]) == 0


def test_workflow_linter_lints_job_with_missing_library(
simple_ctx,
ws,
make_job,
make_notebook,
make_random,
make_directory,
):
expected_problem_message = "Could not locate import: databricks.labs.ucx"
whitelist = create_autospec(Whitelist) # databricks is in default list
whitelist.module_compatibility.return_value = UNKNOWN

simple_ctx = simple_ctx.replace(
whitelist=whitelist,
path_lookup=PathLookup(Path("/non/existing/path"), []), # Avoid finding current project
)

notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=b"import databricks.labs.ucx")
job_without_ucx_library = make_job(notebook_path=notebook)

problems = simple_ctx.workflow_linter.lint_job(job_without_ucx_library.job_id)

assert len([problem for problem in problems if problem.message == expected_problem_message]) > 0
whitelist.module_compatibility.assert_called_once_with("databricks.labs.ucx")


def test_workflow_linter_lints_job_with_wheel_dependency(
simple_ctx,
ws,
make_job,
make_notebook,
make_random,
make_directory,
):
expected_problem_message = "Could not locate import: databricks.labs.ucx"

simple_ctx = simple_ctx.replace(
whitelist=Whitelist(), # databricks is in default list
path_lookup=PathLookup(Path("/non/existing/path"), []), # Avoid finding current project
)

simple_ctx.workspace_installation.run() # Creates ucx wheel
wheels = [file for file in simple_ctx.installation.files() if file.path.endswith(".whl")]
library = compute.Library(whl=wheels[0].path)

notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=b"import databricks.labs.ucx")
job_with_ucx_library = make_job(notebook_path=notebook, libraries=[library])

problems = simple_ctx.workflow_linter.lint_job(job_with_ucx_library.job_id)

assert len([problem for problem in problems if problem.message == expected_problem_message]) == 0
23 changes: 20 additions & 3 deletions tests/unit/source_code/test_jobs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
import io
import logging
from pathlib import Path
from unittest.mock import create_autospec

Expand Down Expand Up @@ -49,13 +49,13 @@ def graph(mock_path_lookup, dependency_resolver) -> DependencyGraph:
def test_workflow_task_container_builds_dependency_graph_not_yet_implemented(mock_path_lookup, graph):
# Goal of test is to raise test coverage, remove after implementing
ws = create_autospec(WorkspaceClient)
library = compute.Library(jar="library.jar", whl="library.whl")
library = compute.Library(jar="library.jar")
task = jobs.Task(task_key="test", libraries=[library], existing_cluster_id="id")

workflow_task_container = WorkflowTaskContainer(ws, task)
problems = workflow_task_container.build_dependency_graph(graph)

assert len(problems) == 2
assert len(problems) == 1
assert all(problem.code == "not-yet-implemented" for problem in problems)
ws.assert_not_called()

Expand Down Expand Up @@ -99,6 +99,23 @@ def test_workflow_task_container_builds_dependency_graph_unknown_pypi_library(mo
ws.assert_not_called()


def test_workflow_task_container_builds_dependency_graph_for_python_wheel(mock_path_lookup, graph):
ws = create_autospec(WorkspaceClient)
ws.workspace.download.return_value = io.BytesIO(b"test")

libraries = [compute.Library(whl="test.whl")]
task = jobs.Task(task_key="test", libraries=libraries)

workflow_task_container = WorkflowTaskContainer(ws, task)
problems = workflow_task_container.build_dependency_graph(graph)

assert len(problems) == 1
assert problems[0].code == "library-install-failed"
assert problems[0].message.startswith("Failed to install")
assert mock_path_lookup.resolve(Path("test")) is None
ws.assert_not_called()


def test_workflow_linter_lint_job_logs_problems(dependency_resolver, mock_path_lookup, empty_index, caplog):
expected_message = "Found job problems:\nUNKNOWN:-1 [library-install-failed] Failed to install unknown-library"

Expand Down
Loading