diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 0000000..9fb5add --- /dev/null +++ b/datasets/__init__.py @@ -0,0 +1,2 @@ +from . import src +from .src import * diff --git a/datasets/src/__init__.py b/datasets/src/__init__.py new file mode 100644 index 0000000..0ae9f25 --- /dev/null +++ b/datasets/src/__init__.py @@ -0,0 +1,4 @@ +__all__ = ["beer_reviews", "colleges", "crime_data", "employee_salaries", + "journal_influence", "medical_charge", "met_objects", + "midwest_survey", "open_payments", "road_safety", + "traffic_violations"] diff --git a/src/beer_reviews.py b/datasets/src/beer_reviews.py similarity index 95% rename from src/beer_reviews.py rename to datasets/src/beer_reviews.py index 082e5e4..3f3b752 100644 --- a/src/beer_reviews.py +++ b/datasets/src/beer_reviews.py @@ -3,7 +3,7 @@ import pandas as pd -from common.file_management import fetch, write_df +from .common.file_management import fetch, write_df DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/colleges.py b/datasets/src/colleges.py similarity index 97% rename from src/colleges.py rename to datasets/src/colleges.py index b2dd496..ffd79a1 100644 --- a/src/colleges.py +++ b/datasets/src/colleges.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd -from common.file_management import fetch, write_df, float_to_int +from .common.file_management import fetch, write_df, float_to_int DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/common/__init__.py b/datasets/src/common/__init__.py similarity index 100% rename from src/common/__init__.py rename to datasets/src/common/__init__.py diff --git a/src/common/file_management.py b/datasets/src/common/file_management.py similarity index 93% rename from src/common/file_management.py rename to datasets/src/common/file_management.py index 55feff9..3591aa9 100644 --- a/src/common/file_management.py +++ b/datasets/src/common/file_management.py @@ -17,12 +17,11 @@ def _unzip(file, data_dir='./'): z.close() -def _untargz(file, ext): - gz = gzip.open(file) - out = open(file, 'wb') - shutil.copyfileobj(gz, out, 8192) - gz.close() - out.close() +def _untargz(file, data_dir): + out = os.path.join(data_dir, os.path.splitext(file)[0]) + with gzip.open(file) as gz: + with open(out, 'wb') as f_out: + shutil.copyfileobj(gz, f_out, 8192) def _untar(file, data_dir): @@ -42,7 +41,7 @@ def _uncompress_file(file, data_dir, delete_archive): _unzip(file, data_dir) processed = True elif ext == '.gz' or header.startswith(b'\x1f\x8b'): - _untargz(file, ext) + _untargz(file, data_dir) processed = True if os.path.isfile(file) and tarfile.is_tarfile(file): _untar(file, data_dir) diff --git a/src/crime_data.py b/datasets/src/crime_data.py similarity index 96% rename from src/crime_data.py rename to datasets/src/crime_data.py index 3696d24..d72ccb1 100644 --- a/src/crime_data.py +++ b/datasets/src/crime_data.py @@ -3,7 +3,7 @@ import pandas as pd -from common.file_management import fetch, write_df, float_to_int +from .common.file_management import fetch, write_df, float_to_int DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/employee_salaries.py b/datasets/src/employee_salaries.py similarity index 96% rename from src/employee_salaries.py rename to datasets/src/employee_salaries.py index e436d58..33df3f2 100644 --- a/src/employee_salaries.py +++ b/datasets/src/employee_salaries.py @@ -4,7 +4,7 @@ import pandas as pd -from common.file_management import fetch, write_df +from .common.file_management import fetch, write_df DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/journal_influence.py b/datasets/src/journal_influence.py similarity index 94% rename from src/journal_influence.py rename to datasets/src/journal_influence.py index 5e9d54a..75375d9 100644 --- a/src/journal_influence.py +++ b/datasets/src/journal_influence.py @@ -3,7 +3,7 @@ import pandas as pd -from common.file_management import fetch, write_df, float_to_int +from .common.file_management import fetch, write_df, float_to_int DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/medical_charge.py b/datasets/src/medical_charge.py similarity index 96% rename from src/medical_charge.py rename to datasets/src/medical_charge.py index 9b82b4e..039cb01 100644 --- a/src/medical_charge.py +++ b/datasets/src/medical_charge.py @@ -3,7 +3,7 @@ import pandas as pd -from common.file_management import fetch, write_df +from .common.file_management import fetch, write_df DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/met_objects.py b/datasets/src/met_objects.py similarity index 97% rename from src/met_objects.py rename to datasets/src/met_objects.py index 6aa20a3..9d208cc 100644 --- a/src/met_objects.py +++ b/datasets/src/met_objects.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd -from common.file_management import fetch, write_df +from .common.file_management import fetch, write_df DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/midwest_survey.py b/datasets/src/midwest_survey.py similarity index 99% rename from src/midwest_survey.py rename to datasets/src/midwest_survey.py index fe11c8d..3d603c7 100644 --- a/src/midwest_survey.py +++ b/datasets/src/midwest_survey.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd -from common.file_management import fetch, write_df +from .common.file_management import fetch, write_df DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/open_payments.py b/datasets/src/open_payments.py similarity index 97% rename from src/open_payments.py rename to datasets/src/open_payments.py index 77086b8..3803a85 100644 --- a/src/open_payments.py +++ b/datasets/src/open_payments.py @@ -3,7 +3,7 @@ import pandas as pd -from common.file_management import fetch, write_df +from .common.file_management import fetch, write_df amount = ['Total_Amount_of_Payment_USDollars'] corp = ['Applicable_Manufacturer_or_Applicable_GPO_Making_' + 'Payment_Name'] diff --git a/src/openml_beer_upload.py b/datasets/src/openml_beer_upload.py similarity index 100% rename from src/openml_beer_upload.py rename to datasets/src/openml_beer_upload.py diff --git a/src/openml_colleges_upload.py b/datasets/src/openml_colleges_upload.py similarity index 100% rename from src/openml_colleges_upload.py rename to datasets/src/openml_colleges_upload.py diff --git a/src/openml_crime_upload.py b/datasets/src/openml_crime_upload.py similarity index 100% rename from src/openml_crime_upload.py rename to datasets/src/openml_crime_upload.py diff --git a/src/openml_employee_upload.py b/datasets/src/openml_employee_upload.py similarity index 100% rename from src/openml_employee_upload.py rename to datasets/src/openml_employee_upload.py diff --git a/src/openml_journal_upload.py b/datasets/src/openml_journal_upload.py similarity index 100% rename from src/openml_journal_upload.py rename to datasets/src/openml_journal_upload.py diff --git a/src/openml_medical_upload.py b/datasets/src/openml_medical_upload.py similarity index 100% rename from src/openml_medical_upload.py rename to datasets/src/openml_medical_upload.py diff --git a/src/openml_met_upload.py b/datasets/src/openml_met_upload.py similarity index 100% rename from src/openml_met_upload.py rename to datasets/src/openml_met_upload.py diff --git a/src/openml_midwest_upload.py b/datasets/src/openml_midwest_upload.py similarity index 100% rename from src/openml_midwest_upload.py rename to datasets/src/openml_midwest_upload.py diff --git a/src/openml_payments_upload.py b/datasets/src/openml_payments_upload.py similarity index 100% rename from src/openml_payments_upload.py rename to datasets/src/openml_payments_upload.py diff --git a/src/openml_road_safety_upload.py b/datasets/src/openml_road_safety_upload.py similarity index 100% rename from src/openml_road_safety_upload.py rename to datasets/src/openml_road_safety_upload.py diff --git a/src/openml_traffic_upload.py b/datasets/src/openml_traffic_upload.py similarity index 100% rename from src/openml_traffic_upload.py rename to datasets/src/openml_traffic_upload.py diff --git a/src/road_safety.py b/datasets/src/road_safety.py similarity index 99% rename from src/road_safety.py rename to datasets/src/road_safety.py index 58833b4..95d2836 100644 --- a/src/road_safety.py +++ b/datasets/src/road_safety.py @@ -5,7 +5,7 @@ import pandas as pd import xlrd -from common.file_management import fetch, write_df, float_to_int +from .common.file_management import fetch, write_df, float_to_int DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/src/test.py b/datasets/src/test.py similarity index 55% rename from src/test.py rename to datasets/src/test.py index ac873e6..649bef8 100644 --- a/src/test.py +++ b/datasets/src/test.py @@ -1,14 +1,14 @@ -from colleges import get_colleges_df -from crime_data import get_crime_df -from employee_salaries import get_employee_salaries_df -from journal_influence import get_journal_influence_df -from medical_charge import get_medical_charge_df -from met_objects import get_met_objects_df -from midwest_survey import get_midwest_survey_df -from open_payments import get_open_payment_df -from road_safety import get_road_safety_df -from traffic_violations import get_traffic_violations_df -from beer_reviews import get_beer_reviews_df +from datasets.src.colleges import get_colleges_df +from datasets.src.crime_data import get_crime_df +from datasets.src.employee_salaries import get_employee_salaries_df +from datasets.src.journal_influence import get_journal_influence_df +from datasets.src.medical_charge import get_medical_charge_df +from datasets.src.met_objects import get_met_objects_df +from datasets.src.midwest_survey import get_midwest_survey_df +from datasets.src.open_payments import get_open_payment_df +from datasets.src.road_safety import get_road_safety_df +from datasets.src.traffic_violations import get_traffic_violations_df +from datasets.src.beer_reviews import get_beer_reviews_df import asyncio func_map = { diff --git a/src/traffic_violations.py b/datasets/src/traffic_violations.py similarity index 96% rename from src/traffic_violations.py rename to datasets/src/traffic_violations.py index 074f4c3..5902f2b 100644 --- a/src/traffic_violations.py +++ b/datasets/src/traffic_violations.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd -from common.file_management import fetch, write_df, float_to_int +from .common.file_management import fetch, write_df, float_to_int DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source']) UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress']) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8cbd7a2 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +import os +from setuptools import setup, find_packages + +description_file = os.path.join(os.path.dirname(__file__), 'README.md') +with open(description_file) as fh: + DESCRIPTION = fh.read() + + +if __name__ == '__main__': + setup(name='datasets', + version='0.0.1', + author='Patricio Cerda', + author_email='patricio.cerda@inria.fr', + description=("Datasets for prediction with string categorical variables"), + long_description=DESCRIPTION, + license='BSD', + classifiers=[ + 'Environment :: Console', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Topic :: Scientific/Engineering', + 'Topic :: Software Development :: Libraries', + ], + platforms='any', + packages=find_packages(), + install_requires=['pandas', 'requests'], + ) diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index e69de29..0000000