Skip to content

Commit 4e299db

Browse files
committed
#803: Frequency Table
1 parent 6461f2a commit 4e299db

16 files changed

+583
-33
lines changed

dtale/column_analysis.py

+126-8
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,12 @@ def __init__(self, data_id, req):
8585
self.analysis_type = get_str_arg(req, "type")
8686
curr_settings = global_state.get_settings(data_id) or {}
8787
self.query = build_query(data_id, curr_settings.get("query"))
88-
data = load_filterable_data(data_id, req, query=self.query)
88+
self.data = load_filterable_data(data_id, req, query=self.query)
8989
self.selected_col = find_selected_column(
90-
data, get_str_arg(req, "col", "values")
90+
self.data, get_str_arg(req, "col", "values")
9191
)
92-
self.data = data[~pd.isnull(data[self.selected_col])]
92+
if self.analysis_type != "frequency":
93+
self.data = self.data[~pd.isnull(self.data[self.selected_col])]
9394
self.dtype = find_dtype(self.data[self.selected_col])
9495
self.classifier = classify_type(self.dtype)
9596
self.code = build_code_export(
@@ -122,6 +123,8 @@ def __init__(self, data_id, req):
122123
self.analysis = WordValueCountAnalysis(req)
123124
elif self.analysis_type == "qq":
124125
self.analysis = QQAnalysis()
126+
elif self.analysis_type == "frequency":
127+
self.analysis = FrequencyAnalysis(req)
125128

126129
def build(self):
127130
base_code = build_code_export(
@@ -390,6 +393,18 @@ def _build_code(self, parent, top_code):
390393
return code
391394

392395

396+
def build_hist(s, code, df_var="chart"):
    """Compute value counts (with percentages) for a series and record the
    equivalent code-export statements.

    :param s: series of values to count
    :param code: list of code-export strings; two statements are appended
    :param df_var: variable name used for the dataframe in the exported code
    :return: dataframe indexed by value with "data" (counts) and "percent" columns
    """
    counts = pd.value_counts(s).to_frame(name="data")
    counts["percent"] = (counts["data"] / counts["data"].sum()) * 100
    code.append("{} = pd.value_counts(s).to_frame(name='data')".format(df_var))
    pct_tmpl = "{df_var}['percent'] = ({df_var}['data'] / {df_var}['data'].sum()) * 100"
    code.append(pct_tmpl.format(df_var=df_var))
    return counts
406+
407+
393408
class ValueCountAnalysis(object):
394409
def __init__(self, req):
395410
self.top = get_int_arg(req, "top")
@@ -398,11 +413,7 @@ def __init__(self, req):
398413
self.cleaners = get_str_arg(req, "cleaner")
399414

400415
    def build_hist(self, s, code):
        # Delegate to the module-level build_hist helper, keeping its default
        # df_var of "chart" for the exported code snippet.
        return build_hist(s, code)
406417

407418
def setup_ordinal_data(self, parent):
408419
if self.ordinal_agg == "pctsum":
@@ -659,3 +670,110 @@ def _build_code(self, parent):
659670
"figure = go.Figure(data=chart, layout=go.{layout})".format(layout=layout),
660671
]
661672
return code
673+
674+
675+
"""
676+
http://anschonfel.hn.res.num:9205/dtale/column-analysis/1?col=str_val&top=10&type=frequency&filtered=false
677+
"""
678+
679+
680+
class FrequencyAnalysis(object):
    """Builds a frequency table for a column, optionally sub-grouped by
    additional "split" columns.

    Request parameters:
      top: maximum number of (most frequent) base values to return
      splits: comma-delimited list of columns to sub-group frequencies by
      cleaner: comma-delimited list of string cleaners to apply
    """

    def __init__(self, req):
        self.top = get_int_arg(req, "top")
        splits = get_str_arg(req, "splits", "")
        self.split_cols = splits.split(",") if splits else []
        self.cleaners = get_str_arg(req, "cleaner")

    def build(self, parent):
        """Build the frequency table from *parent* (the analysis request).

        :param parent: analysis request holding data, selected_col, dtype & classifier
        :return: tuple of (formatted grid data dict, code-export string list)
        """
        code = []
        if parent.classifier == "S":
            # string column: fill nulls with "Missing" and apply any cleaners
            code.append(
                "s = df.fillna('Missing')['{col}']".format(col=parent.selected_col)
            )
            s, cleaner_code = handle_cleaners(
                parent.data[parent.selected_col].fillna("Missing"), self.cleaners
            )
            code += cleaner_code
        else:
            # non-string column: format values as strings so "Missing" can act
            # as the null placeholder alongside real values
            code.append(
                "s = df['{col}'].fillna('Missing').astype(str)".format(
                    col=parent.selected_col
                )
            )
            formatter = find_dtype_formatter(parent.dtype)
            s = parent.data[parent.selected_col].apply(
                lambda x: formatter(x, nan_display="Missing")
            )

        # when splitting, base frequencies go into "base_vals" so the exported
        # code can derive the final "result" from them
        df_var = "base_vals" if len(self.split_cols) else "result"

        base_vals = build_hist(s, code, df_var)
        base_vals.index.name = parent.selected_col
        base_vals = base_vals.rename(
            columns={"data": "Frequency", "percent": "Percent"}
        )
        base_vals = base_vals[base_vals["Frequency"] > 0]
        base_vals = base_vals.reset_index().sort_values(
            ["Frequency", parent.selected_col], ascending=[False, True]
        )
        base_vals = base_vals.head(self.top)

        code += [
            "{}.index.name = '{}'".format(df_var, parent.selected_col),
            "{df_var} = {df_var}.{rename}".format(
                df_var=df_var,
                rename="rename(columns={'data': 'Frequency', 'percent': 'Percent'})",
            ),
            "{df_var} = {df_var}[{df_var}['Frequency'] > 0]".format(df_var=df_var),
            "{df_var} = {df_var}.reset_index().sort_values(['Frequency', '{col}'], ascending=[False, True])".format(
                df_var=df_var, col=parent.selected_col
            ),
            # interpolate the actual value of "top" so the exported snippet is
            # runnable outside this class (previously emitted a literal "self.top")
            "{df_var} = {df_var}.head({top})".format(df_var=df_var, top=self.top),
        ]

        if len(self.split_cols):
            top_vals = base_vals[parent.selected_col]
            val_filter = parent.data[parent.selected_col].isin(top_vals)
            val_filter_code = "val_filter = df['{col}'].isin(top_vals)".format(
                col=parent.selected_col
            )
            if (top_vals == "Missing").any():
                # nulls were displayed as "Missing", so include them in the filter
                val_filter = val_filter | parent.data[parent.selected_col].isnull()
                # build the exported statement as code text (previously this
                # formatted the pandas Series object itself into the string and
                # dropped the "val_filter =" assignment)
                val_filter_code = (
                    "val_filter = df['{col}'].isin(top_vals) | df['{col}'].isnull()"
                ).format(col=parent.selected_col)
            hist = parent.data[val_filter].groupby([s] + self.split_cols).size()
            hist.name = "Frequency"
            hist = hist.reset_index()
            hist = hist[hist["Frequency"] > 0]
            # percentages are relative to each base value's total frequency
            outer_freq = hist.groupby(parent.selected_col)["Frequency"].transform("sum")
            hist["Percent"] = (hist["Frequency"] / outer_freq) * 100

            code += [
                "top_vals = {df_var}['{col}']".format(
                    df_var=df_var, col=parent.selected_col
                ),
                val_filter_code,
                "result = df[val_filter].groupby([s, '{}']).size()".format(
                    "', '".join(self.split_cols)
                ),
                "result.name = 'Frequency'",
                "result = result.reset_index()",
                "result = result[result['Frequency'] > 0]",
                "outer_freq = result.groupby('{}')['Frequency'].transform('sum')".format(
                    parent.selected_col
                ),
                "result['Percent'] = (result['Frequency'] / outer_freq) * 100",
            ]
        else:
            hist = base_vals

        col_types = grid_columns(hist)
        f = grid_formatter(col_types, nan_display=None)
        return_data = f.format_lists(hist)
        return_data = dict(data=return_data)
        return_data["top"] = self.top
        return return_data, code

frontend/static/__tests__/popups/ColumnAnalysis.test.support.ts

+6-6
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@ export const ANALYSIS_DATA = {
33
chart_type: 'histogram',
44
dtype: 'float64',
55
cols: [
6-
{ name: 'intCol', dtype: 'int64' },
7-
{ name: 'bar', dtype: 'float64', coord: 'lat' },
8-
{ name: 'strCol', dtype: 'string' },
9-
{ name: 'dateCol', dtype: 'datetime' },
10-
{ name: 'baz', dtype: 'float64' },
11-
{ name: 'lon', coord: 'lon', dtype: 'float64' },
6+
{ index: 0, name: 'intCol', dtype: 'int64' },
7+
{ index: 1, name: 'bar', dtype: 'float64', coord: 'lat' },
8+
{ index: 2, name: 'strCol', dtype: 'string' },
9+
{ index: 3, name: 'dateCol', dtype: 'datetime' },
10+
{ index: 4, name: 'baz', dtype: 'float64' },
11+
{ index: 5, name: 'lon', coord: 'lon', dtype: 'float64' },
1212
],
1313
query: null,
1414
data: [6, 13, 13, 30, 34, 57, 84, 135, 141, 159, 170, 158, 126, 94, 70, 49, 19, 7, 9, 4],
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import { act, fireEvent, render, screen } from '@testing-library/react';
import axios from 'axios';
import * as React from 'react';
import { Provider } from 'react-redux';

import { ColumnDef } from '../../../dtale/DataViewerState';
import { DetailsCharts, DetailsChartsProps } from '../../../popups/describe/DetailsCharts';
import * as ColumnAnalysisRepository from '../../../repository/ColumnAnalysisRepository';
import DimensionsHelper from '../../DimensionsHelper';
import reduxUtils from '../../redux-test-utils';
import { buildInnerHTML, mockChartJS, parseUrlParams, selectOption } from '../../test-utils';
import { ANALYSIS_DATA } from '../ColumnAnalysis.test.support';

describe('DetailsCharts tests', () => {
  let loadAnalysisSpy: jest.SpyInstance<
    Promise<ColumnAnalysisRepository.ColumnAnalysisResponse | undefined>,
    [dataId: string, params: Record<string, any>]
  >;
  const dimensions = new DimensionsHelper({
    offsetWidth: 500,
    offsetHeight: 500,
  });

  // render DetailsCharts inside a fresh store, merging any prop overrides
  const updateProps = async (propOverrides?: Partial<DetailsChartsProps>): Promise<void> => {
    const props = {
      details: {
        describe: {},
        uniques: { data: [] },
        dtype_counts: [],
        sequential_diffs: { diffs: { data: [] }, min: '', max: '', avg: '' },
        string_metrics: {},
      },
      cols: ANALYSIS_DATA.cols as ColumnDef[],
      dtype: 'string',
      col: 'strCol',
      propagateState: jest.fn(),
      filtered: false,
      ...propOverrides,
    };
    const store = reduxUtils.createDtaleStore();
    buildInnerHTML({ settings: '' }, store);

    await act(
      () =>
        render(
          <Provider store={store}>
            <DetailsCharts {...props} />
          </Provider>,
          {
            container: document.getElementById('content') ?? undefined,
          },
        ).container,
    );
  };

  beforeAll(() => {
    dimensions.beforeAll();
    mockChartJS();
  });

  beforeEach(async () => {
    loadAnalysisSpy = jest.spyOn(ColumnAnalysisRepository, 'loadAnalysis');
    // mock column-analysis responses: frequency (with & without a split column)
    // for strCol, value_counts otherwise; all other urls use the default fetcher
    (axios.get as any).mockImplementation((url: string) => {
      if (url.startsWith('/dtale/column-analysis')) {
        const params = parseUrlParams(url);
        const ordinal = ANALYSIS_DATA.data;
        if (params.col === 'strCol') {
          if (params.type === 'frequency') {
            if (params.splits === 'intCol') {
              return Promise.resolve({
                data: {
                  ...ANALYSIS_DATA,
                  dtype: 'string',
                  chart_type: 'frequency',
                  timestamp: new Date().getTime(),
                  data: {
                    strCol: ['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd', 'd'],
                    intCol: [0, 1, 0, 1, 0, 1, 0, 1, 'Missing'],
                    Frequency: [1, 2, 2, 1, 2, 2, 3, 1, 1],
                    Percent: [33.33, 66.66, 66.66, 33.33, 50.0, 50.0, 60.0, 20.0, 20.0],
                  },
                },
              });
            }
            return Promise.resolve({
              data: {
                ...ANALYSIS_DATA,
                dtype: 'string',
                chart_type: 'frequency',
                timestamp: new Date().getTime(),
                data: {
                  strCol: ['a', 'b', 'c', 'd'],
                  Frequency: [3, 3, 4, 5],
                  Percent: [20.0, 20.0, 26.67, 33.33],
                },
              },
            });
          }
          return Promise.resolve({
            data: {
              ...ANALYSIS_DATA,
              dtype: 'string',
              chart_type: 'value_counts',
              ordinal,
              timestamp: new Date().getTime(),
            },
          });
        }
      }
      return Promise.resolve({ data: reduxUtils.urlFetcher(url) });
    });
  });

  afterEach(jest.restoreAllMocks);
  afterAll(dimensions.afterAll);

  it('frequency grid functionality', async () => {
    await updateProps();
    await act(async () => {
      await fireEvent.click(screen.getByText('Frequency Table'));
    });
    // initial load: no splits selected
    expect(loadAnalysisSpy).toHaveBeenLastCalledWith('1', {
      selectedCol: 'strCol',
      query: '',
      bins: 20,
      top: 100,
      density: false,
      type: 'frequency',
      filtered: false,
      splits: '',
    });
    // 4 value rows + TOTAL footer row
    let rows = screen.getByTestId('frequencies-grid').getElementsByClassName('ReactVirtualized__Table__row');
    expect(rows).toHaveLength(5);
    expect(rows[rows.length - 1].textContent).toBe('TOTAL15100.00%');
    await selectOption(
      screen.getByTestId('splits-select').getElementsByClassName('Select')[0] as HTMLElement,
      'intCol',
    );
    // re-fetch with the intCol split applied
    expect(loadAnalysisSpy).toHaveBeenLastCalledWith('1', {
      selectedCol: 'strCol',
      query: '',
      bins: 20,
      top: 100,
      density: false,
      type: 'frequency',
      filtered: false,
      splits: 'intCol',
    });
    rows = screen.getByTestId('frequencies-grid').getElementsByClassName('ReactVirtualized__Table__row');
    expect(rows).toHaveLength(13);
    expect(rows[rows.length - 1].textContent).toBe('TOTAL5100.00%');
  });
});

frontend/static/popups/analysis/ColumnAnalysisState.ts

+8-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ export enum AnalysisType {
1111
WORD_VALUE_COUNTS = 'word_value_counts',
1212
QQ = 'qq',
1313
BOXPLOT = 'boxplot',
14+
FREQUENCY = 'frequency',
1415
}
1516

1617
/** Base properties of data fetched for column analysis */
@@ -68,6 +69,11 @@ export interface QQChartData extends FetchedChartData<AnalysisType.QQ> {
6869
y2: number[];
6970
}
7071

72+
/** Properties for fetched Frequency Grid data */
export interface FrequencyGridData extends FetchedChartData<AnalysisType.FREQUENCY> {
  // "Frequency" & "Percent" hold counts/percentages; the remaining keys are the
  // selected column and any split columns with their display values
  data: { Frequency: number[]; Percent: number[] } & Record<string, any[]>;
}
76+
7177
/** Properties for column analysis charts using chart.js */
7278
export type ChartJSAnalysisCharts =
7379
| HistogramChartData
@@ -79,7 +85,7 @@ export type ChartJSAnalysisCharts =
7985
export type PlotlyAnalysisCharts = GeolocationChartData | QQChartData;
8086

8187
/** Different charts available for column analysis */
82-
export type AnalysisCharts = ChartJSAnalysisCharts | PlotlyAnalysisCharts;
88+
export type AnalysisCharts = ChartJSAnalysisCharts | PlotlyAnalysisCharts | FrequencyGridData;
8389

8490
/** Parameters for building a column analysis */
8591
export interface AnalysisParams {
@@ -99,6 +105,7 @@ export interface AnalysisParams {
99105
latCol?: BaseOption<string>;
100106
lonCol?: BaseOption<string>;
101107
query?: string;
108+
splits?: Array<BaseOption<string>>;
102109
}
103110

104111
/** State properties of ColumnAnalysis */

0 commit comments

Comments
 (0)