# transform.py
from datetime import date
from html import unescape
import pandas as pd
from bs4 import BeautifulSoup
class Transform:
def __init__(self, df, config):
self.df = df
self.config = config
def save(self, output_path):
"""
Description:
save the cleaned df into csv
args:
output_path: path to output.csv
"""
self.df.to_csv(output_path, index=False)
def html_format_remover(self):
"""
Description:
            remove HTML entities/character references in COLUMNS_REMOVE_HTML(config) columns
"""
for col in self.config["COLUMNS_REMOVE_HTML"]:
self.df[col] = (
self.df[col].apply(
lambda x: BeautifulSoup(
unescape(x), "html.parser"
).text
if pd.notnull(x)
else x
)
)
return self.df
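    # Illustrative example only (not part of the pipeline): for a cell like
    # "&lt;b&gt;Cotton&lt;/b&gt; shirt", html.unescape() first restores the markup to
    # "<b>Cotton</b> shirt", and BeautifulSoup(...).text then strips the tags,
    # leaving "Cotton shirt"; null cells pass through unchanged.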
def reformat_string(self):
"""
Description:
            convert all COLUMN_STR_TYPES(config) columns to lowercase and remove special characters (except in sku)
            remove digits in product_name and product_description
"""
for col in self.config["COLUMN_STR_TYPES"]:
            self.df[col] = self.df[col].astype(str).str.lower()
            if col != "sku":
                # drop special characters and underscores
                self.df[col] = self.df[col].str.replace(
                    r"[^\w\s]|_", "", regex=True
                )
            if col in ("product_name", "product_description"):
                # drop digits from the free-text columns
                self.df[col] = self.df[col].str.replace(
                    r"\d", "", regex=True
                )
return self.df
def keep_unit_price_qyt_more_than_zero(self):
"""
Description:
Keep rows that have COLUMNS_KEEP_MORETHANZERO(config) greater than 0
"""
for col in self.config["COLUMNS_KEEP_MORETHANZERO"]:
            # assigning the result back avoids the pandas chained-assignment
            # warning raised by fillna(..., inplace=True) on a column selection
            self.df[col] = self.df[col].fillna(0)
self.df = self.df[self.df[col] > 0]
return self.df
def age_group(self):
"""
Description:
add age_group column that is derived from COLUMN_CONVERT_AGE(config) column
"""
year = date.today().year
yob = self.config["COLUMN_CONVERT_AGE"][0]
self.df['age'] = self.df[yob].apply(
lambda x: x
if (pd.isnull(x) or x == '')
else year-int(x)
)
        # coerce any leftover empty strings to NaN so the numeric conversion
        # does not raise before binning
        self.df['age'] = pd.to_numeric(self.df['age'], errors='coerce')
self.df['age_group'] = pd.cut(
self.df['age'],
bins=self.config["AGE_BINS"],
labels=self.config["AGE_LABELS"],
right=False,
include_lowest=True
)
self.df = self.df.drop(columns=['age'])
return self.df
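    # Illustrative example only (the real bins and labels come from config):
    # with AGE_BINS = [0, 20, 40, 60, 120] and AGE_LABELS = ["<20", "20-39",
    # "40-59", "60+"], pd.cut(..., right=False) places age 40 in "40-59",
    # because each bin is closed on the left and open on the right.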
def region(self):
"""
Description:
add region column that is derived from COLUMN_CONVERT_POSTAL(config) column
"""
postal_sector = self.config["COLUMN_CONVERT_POSTAL"][0]
        # missing values become the string "nan" after astype(str), so slicing
        # the first two characters is safe; anything that is not a known
        # postal sector falls back to "" in the mapping step below
        self.df[postal_sector] = self.df[postal_sector].astype(str)
        self.df['region'] = self.df[postal_sector].str[:2]
DICT_REGION = {
'north-east': [
"53", "54", "55", "56", "57",
"79",
"80", "82"
],
'north': [
"69",
"70", "71", "72", "73", "75", "76"
],
'central': [
"1", "2", "3", "4", "5", "6", "7", "8", "9",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
"14", "15", "16", "17", "18", "19",
"20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
"30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
"40", "41",
"58", "59",
"77", "78"
],
'west': [
"11", "12", "13",
"60", "61", "62", "63", "64", "65", "66", "67", "68"
],
'east': [
"42", "43", "44", "45", "46", "47", "48", "49",
"50", "51", "52",
"81"
]
}
mapping = {}
for key, values in DICT_REGION.items():
for item in values:
mapping[item] = key
self.df['region'] = self.df['region'].replace(mapping)
        # anything that did not map onto one of the five region names becomes ""
        self.df['region'] = self.df['region'].where(
            self.df['region'].isin(list(DICT_REGION)), ""
        )
return self.df
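    # Illustrative example only: a postal code such as "560123" gives the
    # sector "56", which DICT_REGION maps to "north-east"; sectors not in the
    # mapping (including "na" from missing values) end up as "".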
def remove_spacing_from_sku(self):
"""
Description:
substitute COLUMN_SKU(config) spacing with "_"
substitute COLUMN_SKU(config) starting with 0 with string "A"
"""
sku = self.config["COLUMN_SKU"][0]
self.df[sku] = self.df[sku].replace(
r'[ ]',
'_',
regex=True
)
        # startswith() also copes with empty strings, unlike indexing x[0]
        self.df[sku] = self.df[sku].apply(
            lambda x: "A" + x[1:] if x.startswith("0") else x
        )
return self.df
def combine_storeID_outlet_to_SKU(self):
"""
Description:
            Prefixes COLUMN_SKU(config) with COLUMN_STOREID(config) and COLUMN_OUTLET(config) so the same SKU from different stores stays distinct
"""
sku = self.config["COLUMN_SKU"][0]
sid = self.config["COLUMN_STOREID"][0]
outlet = self.config["COLUMN_OUTLET"][0]
        # cast to str so numeric store IDs or outlets concatenate cleanly
        self.df[sku] = (
            self.df[sid].astype(str) + "_" +
            self.df[outlet].astype(str) + "_" +
            self.df[sku].astype(str)
        )
return self.df
def combine_storeID_outlet_to_customerID(self):
"""
Description:
            Prefixes COLUMN_CUSTID(config) with COLUMN_STOREID(config) and COLUMN_OUTLET(config) so the same customer ID from different stores stays distinct
"""
cid = self.config["COLUMN_CUSTID"][0]
sid = self.config["COLUMN_STOREID"][0]
outlet = self.config["COLUMN_OUTLET"][0]
        self.df[cid] = (
            self.df[sid].astype(str) + "_" +
            self.df[outlet].astype(str) + "_" +
            self.df[cid].astype(str)
        )
return self.df
def remove_null_compulsory_data(self):
"""
Description:
Removes rows where data is null but schema does not allow nullable
and is compulsory as per COLUMNS_NULL_COMPULSORY(config)
"""
        for col in self.config["COLUMNS_NULL_COMPULSORY"]:
            # drop rows whose compulsory value is empty, whitespace-only, or
            # the string "nan" left behind by earlier astype(str) conversions
            values = self.df[col].astype(str).str.strip()
            self.df = self.df[~values.isin(["", "nan"])]
return self.df
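For reference, this is roughly how I call the class; the config keys match what the methods look up, but the column names, bins, and file paths below are just placeholders:

# run_transform.py (illustrative driver only; real values come from my job config)
import pandas as pd
from transform import Transform

config = {
    "COLUMNS_REMOVE_HTML": ["product_description"],
    "COLUMN_STR_TYPES": ["sku", "product_name", "product_description"],
    "COLUMNS_KEEP_MORETHANZERO": ["unit_price", "quantity"],
    "COLUMN_CONVERT_AGE": ["year_of_birth"],
    "AGE_BINS": [0, 20, 40, 60, 120],
    "AGE_LABELS": ["<20", "20-39", "40-59", "60+"],
    "COLUMN_CONVERT_POSTAL": ["postal_code"],
    "COLUMN_SKU": ["sku"],
    "COLUMN_STOREID": ["store_id"],
    "COLUMN_OUTLET": ["outlet"],
    "COLUMN_CUSTID": ["customer_id"],
    "COLUMNS_NULL_COMPULSORY": ["sku", "customer_id"],
}

df = pd.read_csv("raw_sales.csv")          # placeholder input file
transform = Transform(df, config)
transform.html_format_remover()
transform.reformat_string()
transform.keep_unit_price_qyt_more_than_zero()
transform.age_group()
transform.region()
transform.remove_spacing_from_sku()
transform.combine_storeID_outlet_to_SKU()
transform.combine_storeID_outlet_to_customerID()
transform.remove_null_compulsory_data()
transform.save("clean_sales.csv")          # placeholder output file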
Hi all,
I am getting a warning when I run this code. Can anyone take a look and suggest a solution for this ETL code?
Thanks in advance,
karthik