@@ -85,11 +85,12 @@ def __init__(self, data_id, req):
85
85
self .analysis_type = get_str_arg (req , "type" )
86
86
curr_settings = global_state .get_settings (data_id ) or {}
87
87
self .query = build_query (data_id , curr_settings .get ("query" ))
88
- data = load_filterable_data (data_id , req , query = self .query )
88
+ self . data = load_filterable_data (data_id , req , query = self .query )
89
89
self .selected_col = find_selected_column (
90
- data , get_str_arg (req , "col" , "values" )
90
+ self . data , get_str_arg (req , "col" , "values" )
91
91
)
92
- self .data = data [~ pd .isnull (data [self .selected_col ])]
92
+ if self .analysis_type != "frequency" :
93
+ self .data = self .data [~ pd .isnull (self .data [self .selected_col ])]
93
94
self .dtype = find_dtype (self .data [self .selected_col ])
94
95
self .classifier = classify_type (self .dtype )
95
96
self .code = build_code_export (
@@ -122,6 +123,8 @@ def __init__(self, data_id, req):
122
123
self .analysis = WordValueCountAnalysis (req )
123
124
elif self .analysis_type == "qq" :
124
125
self .analysis = QQAnalysis ()
126
+ elif self .analysis_type == "frequency" :
127
+ self .analysis = FrequencyAnalysis (req )
125
128
126
129
def build (self ):
127
130
base_code = build_code_export (
@@ -390,6 +393,18 @@ def _build_code(self, parent, top_code):
390
393
return code
391
394
392
395
396
def build_hist(s, code, df_var="chart"):
    """Count the distinct values of ``s`` and express each count as a percent.

    Besides computing the result, this appends to ``code`` the source lines
    that reproduce the same computation, using ``df_var`` as the name of the
    resulting variable in the generated snippet.

    :param s: values to count (a :class:`pandas.Series` or array-like)
    :param code: list of generated code lines; mutated in place
    :param df_var: variable name used for the result in the generated code
    :return: DataFrame indexed by distinct value with columns ``data``
             (occurrence count) and ``percent`` (share of the total count)
    """
    # ``pd.value_counts`` is deprecated (pandas >= 2.1) in favour of
    # ``Series.value_counts``; the generated code mirrors the runtime call.
    code.append(
        "{} = pd.Series(s).value_counts().to_frame(name='data')".format(df_var)
    )
    code.append(
        "{df_var}['percent'] = ({df_var}['data'] / {df_var}['data'].sum()) * 100".format(
            df_var=df_var
        )
    )
    df = pd.Series(s).value_counts().to_frame(name="data")
    df["percent"] = (df["data"] / df["data"].sum()) * 100
    return df
406
+
407
+
393
408
class ValueCountAnalysis (object ):
394
409
def __init__ (self , req ):
395
410
self .top = get_int_arg (req , "top" )
@@ -398,11 +413,7 @@ def __init__(self, req):
398
413
self .cleaners = get_str_arg (req , "cleaner" )
399
414
400
415
def build_hist(self, s, code):
    """Build the value-count histogram for ``s``.

    Forwards to the module-level ``build_hist`` function with its default
    result-variable name; ``code`` is passed through so the generated code
    lines are appended to the caller's list.
    """
    histogram = build_hist(s, code)
    return histogram
406
417
407
418
def setup_ordinal_data (self , parent ):
408
419
if self .ordinal_agg == "pctsum" :
@@ -659,3 +670,110 @@ def _build_code(self, parent):
659
670
"figure = go.Figure(data=chart, layout=go.{layout})" .format (layout = layout ),
660
671
]
661
672
return code
673
+
674
+
675
+ """
676
+ Example request served by FrequencyAnalysis (type=frequency): http://anschonfel.hn.res.num:9205/dtale/column-analysis/1?col=str_val&top=10&type=frequency&filtered=false
677
+ """
678
+
679
+
680
class FrequencyAnalysis(object):
    """Frequency-table analysis of a column, optionally split by other columns.

    Request arguments read in ``__init__``:

    * ``top``: number of most frequent values to keep
    * ``splits``: comma-separated names of additional group-by columns
    * ``cleaner``: cleaners handed to ``handle_cleaners`` for string columns
    """

    def __init__(self, req):
        self.top = get_int_arg(req, "top")
        raw_splits = get_str_arg(req, "splits", "")
        # An empty/missing "splits" argument means "no split columns".
        self.split_cols = raw_splits.split(",") if raw_splits else []
        self.cleaners = get_str_arg(req, "cleaner")

    def build(self, parent):
        """Compute the frequency table and the code that reproduces it.

        :param parent: analysis object providing ``classifier``,
                       ``selected_col``, ``data`` and ``dtype``
        :return: tuple of (``dict`` with keys ``data`` and ``top``,
                 list of generated code lines)
        """
        code = []
        if parent.classifier == "S":
            code.append(
                "s = df.fillna('Missing')['{col}']".format(col=parent.selected_col)
            )
            s, cleaner_code = handle_cleaners(
                parent.data[parent.selected_col].fillna("Missing"), self.cleaners
            )
            code += cleaner_code
        else:
            code.append(
                "s = df['{col}'].fillna('Missing').astype(str)".format(
                    col=parent.selected_col
                )
            )
            formatter = find_dtype_formatter(parent.dtype)
            s = parent.data[parent.selected_col].apply(
                lambda x: formatter(x, nan_display="Missing")
            )

        # Without splits the base table is the final result, so the generated
        # code names it "result" directly.
        df_var = "base_vals" if self.split_cols else "result"

        base_vals = build_hist(s, code, df_var)
        base_vals.index.name = parent.selected_col
        base_vals = base_vals.rename(
            columns={"data": "Frequency", "percent": "Percent"}
        )
        base_vals = base_vals[base_vals["Frequency"] > 0]
        base_vals = base_vals.reset_index().sort_values(
            ["Frequency", parent.selected_col], ascending=[False, True]
        )
        base_vals = base_vals.head(self.top)

        code += [
            "{}.index.name = '{}'".format(df_var, parent.selected_col),
            "{df_var} = {df_var}.{rename}".format(
                df_var=df_var,
                rename="rename(columns={'data': 'Frequency', 'percent': 'Percent'})",
            ),
            "{df_var} = {df_var}[{df_var}['Frequency'] > 0]".format(df_var=df_var),
            "{df_var} = {df_var}.reset_index().sort_values(['Frequency', '{col}'], ascending=[False, True])".format(
                df_var=df_var, col=parent.selected_col
            ),
            # Interpolate the actual value: "self" does not exist in the
            # exported snippet.
            "{df_var} = {df_var}.head({top})".format(df_var=df_var, top=self.top),
        ]

        if self.split_cols:
            top_vals = base_vals[parent.selected_col]
            # NOTE(review): top_vals come from the formatted series ``s`` while
            # this filter tests the raw column; for non-string columns the two
            # may not compare equal -- confirm against find_dtype_formatter.
            val_filter = parent.data[parent.selected_col].isin(top_vals)
            val_filter_code = "val_filter = df['{col}'].isin(top_vals)".format(
                col=parent.selected_col
            )
            if (top_vals == "Missing").any():
                # "Missing" stands in for nulls, so keep null rows as well.
                val_filter = val_filter | parent.data[parent.selected_col].isnull()
                val_filter_code = "val_filter = (df['{col}'].isin(top_vals) | df['{col}'].isnull())".format(
                    col=parent.selected_col
                )
            hist = parent.data[val_filter].groupby([s] + self.split_cols).size()
            hist.name = "Frequency"
            hist = hist.reset_index()
            hist = hist[hist["Frequency"] > 0]
            # Percent is relative to the total of each selected-column value.
            outer_freq = hist.groupby(parent.selected_col)["Frequency"].transform("sum")
            hist["Percent"] = (hist["Frequency"] / outer_freq) * 100

            code += [
                "top_vals = {df_var}['{col}']".format(
                    df_var=df_var, col=parent.selected_col
                ),
                val_filter_code,
                "result = df[val_filter].groupby([s, '{}']).size()".format(
                    "', '".join(self.split_cols)
                ),
                "result.name = 'Frequency'",
                "result = result.reset_index()",
                "result = result[result['Frequency'] > 0]",
                "outer_freq = result.groupby('{}')['Frequency'].transform('sum')".format(
                    parent.selected_col
                ),
                "result['Percent'] = (result['Frequency'] / outer_freq) * 100",
            ]
        else:
            hist = base_vals

        col_types = grid_columns(hist)
        f = grid_formatter(col_types, nan_display=None)
        return_data = dict(data=f.format_lists(hist))
        return_data["top"] = self.top
        return return_data, code
0 commit comments