Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

In dependency graph, support dependencies other than notebooks #1343

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
e47b965
add test util for loading local source
ericvergnaud Apr 4, 2024
a0c1e16
add Notebook samples
ericvergnaud Apr 4, 2024
a2808ce
rename sample to avoid fmt errors
ericvergnaud Apr 4, 2024
4066042
allow loading local notebook samples
ericvergnaud Apr 4, 2024
12e075c
reads cells from notebook samples
ericvergnaud Apr 4, 2024
ee88135
fix pylint issues
ericvergnaud Apr 4, 2024
c428e18
uncomment
ericvergnaud Apr 4, 2024
c7e0065
ensure notebook is rebuilt properly after migration
ericvergnaud Apr 4, 2024
d0ce635
builds leaf dependency graph
ericvergnaud Apr 4, 2024
38e18cc
add support for RUN cells
ericvergnaud Apr 4, 2024
c3c160e
add samples for dependency graph
ericvergnaud Apr 4, 2024
9fd6a11
builds dependency graph of depth 1 in %run cells
ericvergnaud Apr 5, 2024
8e36094
builds a multi-level dependency graph
ericvergnaud Apr 5, 2024
cece88d
caters for duplicates
ericvergnaud Apr 5, 2024
3a39712
support cyclical dependencies
ericvergnaud Apr 5, 2024
63b1d82
fix format
ericvergnaud Apr 5, 2024
2aaedf8
Merge branch 'main' into notebook-dependency-graph
ericvergnaud Apr 5, 2024
946d998
fix failing test - source_code dir contains non-python files
ericvergnaud Apr 5, 2024
da61c2b
fix pylint issue
ericvergnaud Apr 5, 2024
b43dd5f
fix failing tests
ericvergnaud Apr 5, 2024
81fffd3
rename notebooks to NotebookMigrator
ericvergnaud Apr 5, 2024
6cdcfd4
align naming
ericvergnaud Apr 5, 2024
80c07a5
Merge branch 'main' into detect-dbutils.notebook.run
ericvergnaud Apr 5, 2024
59d1ece
drop renamed samples
ericvergnaud Apr 5, 2024
fa785fe
make cell code runnable
ericvergnaud Apr 5, 2024
036d5fc
add sample
ericvergnaud Apr 8, 2024
8796eec
convert to python 3
ericvergnaud Apr 8, 2024
ac13e63
more tests
ericvergnaud Apr 8, 2024
c6aaa2e
locates calls to dbutils.notebook.run
ericvergnaud Apr 8, 2024
94fc41c
more tests
ericvergnaud Apr 8, 2024
cd58d7a
Update src/databricks/labs/ucx/source_code/astlinter.py
ericvergnaud Apr 8, 2024
f0770af
Update src/databricks/labs/ucx/source_code/notebook.py
ericvergnaud Apr 8, 2024
12c8e30
add notebook samples
ericvergnaud Apr 8, 2024
33495a1
refactor for supporting both linting and dependency detection
ericvergnaud Apr 8, 2024
c005c46
raise distinct Advisory for manual vs automated migration
ericvergnaud Apr 8, 2024
1f94b3c
use correct Advice codes
ericvergnaud Apr 8, 2024
cb253e5
integrate code
ericvergnaud Apr 8, 2024
bcd320c
formatting
ericvergnaud Apr 9, 2024
ea181b4
address code review comments
ericvergnaud Apr 9, 2024
1ddd452
change model to avoid clashing ownership patterns
ericvergnaud Apr 9, 2024
06256eb
remove unused
ericvergnaud Apr 9, 2024
82cc2ec
fix failing tests
ericvergnaud Apr 9, 2024
fa6a8cf
move PythonLinter to python_linter file renamed from astlinter
ericvergnaud Apr 9, 2024
967f956
address code review comments
ericvergnaud Apr 9, 2024
e1fcb4f
Merge branch 'main' into detect-dbutils.notebook.run
ericvergnaud Apr 9, 2024
0f15ef2
fix formatting
ericvergnaud Apr 9, 2024
d7204f3
improve test code coverage and quality
ericvergnaud Apr 9, 2024
5cb1744
build dependency graph as a top level function
ericvergnaud Apr 9, 2024
941cb41
Merge branch 'detect-dbutils.notebook.run' into integrate-detection-o…
ericvergnaud Apr 9, 2024
53e85ec
support all dependencies, not just notebooks
ericvergnaud Apr 10, 2024
e9e8cab
fixing tests - not done
ericvergnaud Apr 10, 2024
e050050
Merge branch 'main' into detect-import-dependencies
ericvergnaud Apr 10, 2024
5732a40
fix failing tests
ericvergnaud Apr 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions src/databricks/labs/ucx/source_code/dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
from __future__ import annotations

import abc
from collections.abc import Callable

from databricks.sdk.service.workspace import ObjectType, ObjectInfo, ExportFormat
from databricks.sdk import WorkspaceClient


class Dependency:

@staticmethod
def from_object_info(object_info: ObjectInfo):
assert object_info.path is not None
return Dependency(object_info.object_type, object_info.path)

def __init__(self, object_type: ObjectType | None, path: str):
self._type = object_type
self._path = path

@property
def type(self) -> ObjectType | None:
return self._type

@property
def path(self) -> str:
return self._path

def __hash__(self):
return hash(self.path)

def __eq__(self, other):
return isinstance(other, Dependency) and self.path == other.path


class SourceContainer(abc.ABC):

@property
@abc.abstractmethod
def object_type(self) -> ObjectType:
raise NotImplementedError()

@abc.abstractmethod
def build_dependency_graph(self, graph) -> None:
raise NotImplementedError()


class DependencyLoader:

def __init__(self, ws: WorkspaceClient):
self._ws = ws

def load_dependency(self, dependency: Dependency) -> SourceContainer | None:
object_info = self._load_object(dependency)
if object_info.object_type is ObjectType.NOTEBOOK:
return self._load_notebook(object_info)
raise NotImplementedError(str(object_info.object_type))

def _load_object(self, dependency: Dependency) -> ObjectInfo:
result = self._ws.workspace.list(dependency.path)
object_info = next((oi for oi in result), None)
if object_info is None:
raise ValueError(f"Could not locate object at '{dependency.path}'")
if dependency.type is not None and object_info.object_type is not dependency.type:
raise ValueError(
f"Invalid object at '{dependency.path}', expected a {str(dependency.type)}, got a {str(object_info.object_type)}"
)
return object_info

def _load_notebook(self, object_info: ObjectInfo) -> SourceContainer:
# local import to avoid circular dependency
# pylint: disable=import-outside-toplevel
from databricks.labs.ucx.source_code.notebook import Notebook

assert object_info.path is not None
assert object_info.language is not None
source = self._load_source(object_info)
return Notebook.parse(object_info.path, source, object_info.language)

def _load_source(self, object_info: ObjectInfo) -> str:
if not object_info.language or not object_info.path:
raise ValueError(f"Invalid ObjectInfo: {object_info}")
with self._ws.workspace.download(object_info.path, format=ExportFormat.SOURCE) as f:
return f.read().decode("utf-8")


class DependencyGraph:

def __init__(self, dependency: Dependency, parent: DependencyGraph | None, loader: DependencyLoader):
self._dependency = dependency
self._parent = parent
self._loader = loader
self._dependencies: dict[Dependency, DependencyGraph] = {}

@property
def dependency(self):
return self._dependency

@property
def path(self):
return self._dependency.path

def register_dependency(self, dependency: Dependency) -> DependencyGraph | None:
# already registered ?
child_graph = self.locate_dependency(dependency)
if child_graph is not None:
self._dependencies[dependency] = child_graph
return child_graph
# nay, create the child graph and populate it
child_graph = DependencyGraph(dependency, self, self._loader)
self._dependencies[dependency] = child_graph
container = self._loader.load_dependency(dependency)
if not container:
return None
container.build_dependency_graph(child_graph)
return child_graph

def locate_dependency(self, dependency: Dependency) -> DependencyGraph | None:
# need a list since unlike JS, Python won't let you assign closure variables
found: list[DependencyGraph] = []
path = dependency.path
# TODO https://github.com/databrickslabs/ucx/issues/1287
path = path[2:] if path.startswith('./') else path

def check_registered_dependency(graph):
# TODO https://github.com/databrickslabs/ucx/issues/1287
graph_path = graph.path[2:] if graph.path.startswith('./') else graph.path
if graph_path == path:
found.append(graph)
return True
return False

self.root.visit(check_registered_dependency)
return found[0] if len(found) > 0 else None

@property
def root(self):
return self if self._parent is None else self._parent.root

@property
def dependencies(self) -> set[Dependency]:
dependencies: set[Dependency] = set()

def add_to_dependencies(graph: DependencyGraph) -> bool:
if graph.dependency in dependencies:
return True
dependencies.add(graph.dependency)
return False

self.visit(add_to_dependencies)
return dependencies

@property
def paths(self) -> set[str]:
return {d.path for d in self.dependencies}

# when visit_node returns True it interrupts the visit
def visit(self, visit_node: Callable[[DependencyGraph], bool | None]) -> bool | None:
if visit_node(self):
return True
for dependency in self._dependencies.values():
if dependency.visit(visit_node):
return True
return False
84 changes: 10 additions & 74 deletions src/databricks/labs/ucx/source_code/notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import logging
from abc import ABC, abstractmethod
from ast import parse as parse_python
from collections.abc import Callable
from enum import Enum

from sqlglot import ParseError as SQLParseError
from sqlglot import parse as parse_sql
from databricks.sdk.service.workspace import Language
from databricks.sdk.service.workspace import Language, ObjectType

from databricks.labs.ucx.source_code.dependencies import DependencyGraph, Dependency, SourceContainer
from databricks.labs.ucx.source_code.python_linter import ASTLinter, PythonLinter


Expand Down Expand Up @@ -79,7 +79,8 @@ def build_dependency_graph(self, parent: DependencyGraph):
assert isinstance(node, ast.Call)
path = PythonLinter.get_dbutils_notebook_run_path_arg(node)
if isinstance(path, ast.Constant):
parent.register_dependency(path.value.strip("'").strip('"'))
dependency = Dependency(ObjectType.NOTEBOOK, path.value.strip("'").strip('"'))
parent.register_dependency(dependency)


class RCell(Cell):
Expand Down Expand Up @@ -155,7 +156,7 @@ def build_dependency_graph(self, parent: DependencyGraph):
start = line.index(command)
if start >= 0:
path = line[start + len(command) :].strip()
parent.register_dependency(path.strip('"'))
parent.register_dependency(Dependency(ObjectType.NOTEBOOK, path.strip('"')))
return
raise ValueError("Missing notebook path in %run command")

Expand Down Expand Up @@ -297,76 +298,7 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str:
return "\n".join(lines)


class DependencyGraph:

def __init__(self, path: str, parent: DependencyGraph | None, locator: Callable[[str], Notebook]):
self._path = path
self._parent = parent
self._locator = locator
self._dependencies: dict[str, DependencyGraph] = {}

@property
def path(self):
return self._path

def register_dependency(self, path: str) -> DependencyGraph | None:
# already registered ?
child_graph = self.locate_dependency(path)
if child_graph is not None:
self._dependencies[path] = child_graph
return child_graph
# nay, create the child graph and populate it
child_graph = DependencyGraph(path, self, self._locator)
self._dependencies[path] = child_graph
notebook = self._locator(path)
if not notebook:
return None
notebook.build_dependency_graph(child_graph)
return child_graph

def locate_dependency(self, path: str) -> DependencyGraph | None:
# need a list since unlike JS, Python won't let you assign closure variables
found: list[DependencyGraph] = []
path = path[2:] if path.startswith('./') else path

def check_registered_dependency(graph):
graph_path = graph.path[2:] if graph.path.startswith('./') else graph.path
if graph_path == path:
found.append(graph)
return True
return False

self.root.visit(check_registered_dependency)
return found[0] if len(found) > 0 else None

@property
def root(self):
return self if self._parent is None else self._parent.root

@property
def paths(self) -> set[str]:
paths: set[str] = set()

def add_to_paths(graph: DependencyGraph) -> bool:
if graph.path in paths:
return True
paths.add(graph.path)
return False

self.visit(add_to_paths)
return paths

# when visit_node returns True it interrupts the visit
def visit(self, visit_node: Callable[[DependencyGraph], bool | None]) -> bool | None:
if visit_node(self):
return True
for dependency in self._dependencies.values():
if dependency.visit(visit_node):
return True
return False


class Notebook:
class Notebook(SourceContainer):

@staticmethod
def parse(path: str, source: str, default_language: Language) -> Notebook:
Expand All @@ -383,6 +315,10 @@ def __init__(self, path: str, source: str, language: Language, cells: list[Cell]
self._cells = cells
self._ends_with_lf = ends_with_lf

@property
def object_type(self) -> ObjectType:
return ObjectType.NOTEBOOK

@property
def path(self) -> str:
return self._path
Expand Down
53 changes: 17 additions & 36 deletions src/databricks/labs/ucx/source_code/notebook_migrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,25 @@
from databricks.sdk.service.workspace import ExportFormat, ObjectInfo, ObjectType

from databricks.labs.ucx.source_code.languages import Languages
from databricks.labs.ucx.source_code.notebook import DependencyGraph, Notebook, RunCell
from databricks.labs.ucx.source_code.notebook import Notebook, RunCell
from databricks.labs.ucx.source_code.dependencies import DependencyGraph, Dependency, DependencyLoader


class NotebookMigrator:
def __init__(self, ws: WorkspaceClient, languages: Languages):
def __init__(self, ws: WorkspaceClient, languages: Languages, loader: DependencyLoader):
self._ws = ws
self._languages = languages
self._loader = loader

def build_dependency_graph(self, object_info: ObjectInfo) -> DependencyGraph:
if not object_info.path or not object_info.language or object_info.object_type is not ObjectType.NOTEBOOK:
raise ValueError("Not a valid Notebook")
dependency = Dependency.from_object_info(object_info)
graph = DependencyGraph(dependency, None, self._loader)
container = self._loader.load_dependency(dependency)
if container is not None:
container.build_dependency_graph(graph)
return graph

def revert(self, object_info: ObjectInfo):
if not object_info.path:
Expand All @@ -21,23 +33,16 @@ def revert(self, object_info: ObjectInfo):
def apply(self, object_info: ObjectInfo) -> bool:
if not object_info.path or not object_info.language or object_info.object_type is not ObjectType.NOTEBOOK:
return False
notebook = self._load_notebook(object_info)
notebook = self._loader.load_dependency(Dependency.from_object_info(object_info))
assert isinstance(notebook, Notebook)
return self._apply(notebook)

def build_dependency_graph(self, object_info: ObjectInfo) -> DependencyGraph:
if not object_info.path or not object_info.language or object_info.object_type is not ObjectType.NOTEBOOK:
raise ValueError("Not a valid Notebook")
notebook = self._load_notebook(object_info)
dependencies = DependencyGraph(object_info.path, None, self._load_notebook_from_path)
notebook.build_dependency_graph(dependencies)
return dependencies

def _apply(self, notebook: Notebook) -> bool:
changed = False
for cell in notebook.cells:
# %run is not a supported language, so this needs to come first
if isinstance(cell, RunCell):
# TODO data on what to change to ?
# TODO migration data ?
if cell.migrate_notebook_path():
changed = True
continue
Expand All @@ -52,27 +57,3 @@ def _apply(self, notebook: Notebook) -> bool:
self._ws.workspace.upload(notebook.path, notebook.to_migrated_code().encode("utf-8"))
# TODO https://github.com/databrickslabs/ucx/issues/1327 store 'migrated' status
return changed

def _load_notebook_from_path(self, path: str) -> Notebook:
object_info = self._load_object(path)
if object_info.object_type is not ObjectType.NOTEBOOK:
raise ValueError(f"Not a Notebook: {path}")
return self._load_notebook(object_info)

def _load_object(self, path: str) -> ObjectInfo:
result = self._ws.workspace.list(path)
object_info = next((oi for oi in result), None)
if object_info is None:
raise ValueError(f"Could not locate object at '{path}'")
return object_info

def _load_notebook(self, object_info: ObjectInfo) -> Notebook:
assert object_info is not None and object_info.path is not None and object_info.language is not None
source = self._load_source(object_info)
return Notebook.parse(object_info.path, source, object_info.language)

def _load_source(self, object_info: ObjectInfo) -> str:
if not object_info.language or not object_info.path:
raise ValueError(f"Invalid ObjectInfo: {object_info}")
with self._ws.workspace.download(object_info.path, format=ExportFormat.SOURCE) as f:
return f.read().decode("utf-8")
Loading
Loading