@@ -4,9 +4,11 @@ import (
	"context"
	"errors"
	"log/slog"
+	"strings"

	"github.com/farbodahm/streame/pkg/functions"
	"github.com/farbodahm/streame/pkg/functions/join"
+	"github.com/farbodahm/streame/pkg/state_store"
	"github.com/farbodahm/streame/pkg/types"
	"github.com/farbodahm/streame/pkg/utils"
	"github.com/google/uuid"
@@ -24,6 +26,15 @@ type StreamDataFrame struct {
	Stages  []Stage
	Schema  types.Schema
	Configs *Config
+
+	stateStore state_store.StateStore
+	// previousExecutors holds all of the SDFs which the current SDF relies on.
+	// Currently only the `Join` operation requires this structure, so that it can
+	// first run all of the previous SDFs before running itself.
+	previousExecutors []*StreamDataFrame
+
+	// TODO: Refactor for a better way of storing runtime configs
+	runtimeConfig map[string]any
}

// NewStreamDataFrame creates a new StreamDataFrame with the given options
@@ -37,13 +48,20 @@ func NewStreamDataFrame(
) StreamDataFrame {
	// Create config with default values
	config := Config{
-		LogLevel: slog.LevelInfo,
+		LogLevel:   slog.LevelInfo,
+		StateStore: state_store.NewInMemorySS(),
	}
	// Functional Option pattern
	for _, option := range options {
		option(&config)
	}

+	utils.InitLogger(config.LogLevel)
+
+	if _, ok := config.StateStore.(*state_store.InMemorySS); ok {
+		utils.Logger.Warn("Using in-memory state store. This is not suitable for production use.")
+	}
+
	sdf := StreamDataFrame{
		SourceStream: sourceStream,
		OutputStream: outputStream,
@@ -52,10 +70,18 @@ func NewStreamDataFrame(
		Stages:  []Stage{},
		Schema:  schema,
		Configs: &config,
+
+		runtimeConfig:     make(map[string]any),
+		stateStore:        config.StateStore,
+		previousExecutors: []*StreamDataFrame{},
+	}
+
+	// Only source streams need to have schema validation. When a SDF
+	// is created by joining 2 other streams, it doesn't need any schema validation stage.
+	if !strings.HasSuffix(streamName, join.JoinedStreamSuffix) {
+		sdf.validateSchema()
	}

-	sdf.validateSchema()
-	utils.InitLogger(config.LogLevel)
	return sdf
}
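As a point of reference, here is a minimal sketch of calling this constructor. The `core` package name, the error-channel element type, and the `withDebugLogs` option are assumptions inferred from the diff (the options loop implies any `func(*Config)` value works), not documented API:

```go
package main

import (
	"log/slog"

	"github.com/farbodahm/streame/pkg/core" // assumed package path for this file
	"github.com/farbodahm/streame/pkg/types"
)

func main() {
	in := make(chan types.Record)  // source records enter here
	out := make(chan types.Record) // processed records exit here
	errs := make(chan error)       // assumed element type of the error stream

	// Hypothetical option: any func(*core.Config) fits the options loop above.
	withDebugLogs := func(c *core.Config) { c.LogLevel = slog.LevelDebug }

	// StateStore defaults to the in-memory store unless an option overrides it,
	// which triggers the "not suitable for production" warning above.
	sdf := core.NewStreamDataFrame(in, out, errs, types.Schema{}, "user-events", withDebugLogs)
	_ = sdf
}
```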
@@ -85,8 +111,45 @@ func (sdf *StreamDataFrame) Select(columns ...string) DataFrame {
}

// Join joins the DataFrame with another DataFrame based on the given join type and condition
-func (sdf *StreamDataFrame) Join(other DataFrame, how join.JoinType, on join.JoinCondition) DataFrame {
-	panic("Not Implemented")
+func (sdf *StreamDataFrame) Join(other *StreamDataFrame, how join.JoinType, on join.JoinCondition, mode join.JoinMode) DataFrame {
+	// Validate join condition
+	err := join.ValidateJoinCondition(sdf.Schema, other.Schema, on)
+	if err != nil {
+		panic(err)
+	}
+
+	// Merge schemas
+	new_schema, err := join.MergeSchema(sdf.Schema, other.GetSchema())
+	if err != nil {
+		panic(err)
+	}
+
+	// Fan-In pattern to join 2 streams into 1 stream
+	merged_sources := utils.MergeChannels(sdf.OutputStream, other.OutputStream)
+	merged_errors := utils.MergeChannels(sdf.ErrorStream, other.ErrorStream)
+
+	out := make(chan types.Record)
+	new_sdf := NewStreamDataFrame(
+		merged_sources,
+		out,
+		merged_errors,
+		new_schema,
+		sdf.Name+"-"+other.Name+join.JoinedStreamSuffix,
+	)
+	// TODO: Decide on configs
+	new_sdf.Configs = sdf.Configs
+
+	new_sdf.runtimeConfig[sdf.Name] = join.Stream
+	new_sdf.runtimeConfig[other.Name] = join.Table
+
+	executor := func(ctx context.Context, record types.Record) ([]types.Record, error) {
+		record_type := new_sdf.runtimeConfig[record.Metadata.Stream].(join.RecordType)
+		return join.InnerJoinStreamTable(new_sdf.stateStore, record_type, record, on), nil
+	}
+
+	new_sdf.previousExecutors = append(new_sdf.previousExecutors, sdf, other)
+	new_sdf.addToStages(executor)
+	return &new_sdf
}

// Filter applies filter function to each record of the DataFrame
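The `Join` above fans both parents' output and error channels into single channels via `utils.MergeChannels`. As a sketch of the Fan-In pattern it names, here is what such a generic helper typically looks like; streame's actual implementation may differ:

```go
package fanin

import "sync"

// MergeChannels forwards every value from each input channel onto one output
// channel, closing the output only after all inputs are closed.
func MergeChannels[T any](inputs ...<-chan T) <-chan T {
	out := make(chan T)
	var wg sync.WaitGroup
	wg.Add(len(inputs))

	for _, in := range inputs {
		go func(in <-chan T) {
			defer wg.Done()
			for v := range in { // drain until this input closes
				out <- v
			}
		}(in)
	}

	// Close the merged channel only after every producer goroutine finishes.
	go func() {
		wg.Wait()
		close(out)
	}()
	return out
}
```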
@@ -206,11 +269,17 @@ func (sdf *StreamDataFrame) addToStages(executor StageExecutor) {
// It simply runs all of the stages.
// It's a blocking call and returns when the context is cancelled or panics when an error occurs.
func (sdf *StreamDataFrame) Execute(ctx context.Context) error {
-	utils.Logger.Info("Executing processor with", "len(stages)", len(sdf.Stages))
+	utils.Logger.Info("Executing processor", "name", sdf.Name, "len(stages)", len(sdf.Stages))
	if len(sdf.Stages) == 0 {
		return errors.New("no stages are created")
	}

+	// Execute previous SDFs which the current SDF depends on first (if there are any)
+	for _, previous_sdf := range sdf.previousExecutors {
+		utils.Logger.Info("Executing previous SDF", "name", previous_sdf.Name)
+		go previous_sdf.Execute(ctx)
+	}
+
	for _, stage := range sdf.Stages {
		go stage.Run(ctx)
	}
@@ -220,7 +289,7 @@ func (sdf *StreamDataFrame) Execute(ctx context.Context) error {
	case err := <-sdf.ErrorStream:
		panic(err)
	case <-ctx.Done():
-		utils.Logger.Info("Processor execution completed")
+		utils.Logger.Info("Processor execution completed", "name", sdf.Name)
		return nil // Exit the loop if the context is cancelled
	}
}
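Putting the pieces together: `Execute` first launches each parent SDF from `previousExecutors` in its own goroutine, then runs this SDF's stages and blocks until the context is cancelled. A minimal, hypothetical driver (the `run` helper is illustrative, not part of the library):

```go
package main

import (
	"context"
	"log"
	"time"
)

// run drives any value exposing the Execute method from the diff; cancelling
// the context is the only non-panicking way its select loop returns.
func run(joined interface{ Execute(context.Context) error }) {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Blocks here: parent SDFs and stages run in goroutines, and Execute
	// returns nil once ctx.Done() fires.
	if err := joined.Execute(ctx); err != nil {
		log.Fatal(err)
	}
}
```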