I made this quick project to demonstrate my proficiency in SQL, focusing on data cleaning, transformation, and exploratory data analysis using a dataset containing information about tech companies layoffs.
The project is divided into two main sections:
SELECT *
FROM layoffs;
Layoffs dataset
-- Create a duplicate table to keep the raw data if needed
CREATE TABLE layoffs_staging LIKE layoffs;
-- Check for duplicates
WITH duplicate_cte AS (
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, location, industry, total_laid_off,
percentage_laid_off, `date`, stage, country, funds_raised_millions
ORDER BY company) AS row_num
FROM layoffs
)
SELECT *
FROM duplicate_cte
WHERE row_num > 1; # Return only duplicates
-- 1. Remove Duplicates
INSERT INTO layoffs_staging
SELECT DISTINCT *
FROM layoffs;
-- Check again for duplicates in new table
WITH duplicate_cte AS (
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions
ORDER BY company) AS row_num
FROM layoffs_staging
)
SELECT *
FROM duplicate_cte
WHERE row_num > 1; # Return
-- Trim spaces in 'company' and 'country' columns
UPDATE layoffs_staging
SET company = TRIM(company),
country = TRIM(TRAILING '.' FROM country);
-- Standardize 'industry' column
UPDATE layoffs_staging
SET industry = 'Crypto'
WHERE industry LIKE 'Crypto%';
-- Convert 'date' from text to DATE type
UPDATE layoffs_staging
SET `date` = STR_TO_DATE(`date`, '%m/%d/%Y');
ALTER TABLE layoffs_staging
MODIFY COLUMN `date` DATE;
Regroup spelling errors
Regroup similar categories
-- 3. Handle Null and Blank Values
-- Update blank industry to NULL
UPDATE layoffs_staging
SET industry = NULL
WHERE industry = '';
-- Fill missing industry values from same company and location
UPDATE layoffs_staging t1
JOIN (SELECT company, location, industry
FROM layoffs_staging
WHERE industry IS NOT NULL) t2
ON t1.company = t2.company AND t1.location = t2.location
SET t1.industry = t2.industry
WHERE t1.industry IS NULL;
-- Remove rows with both 'total_laid_off' and 'percentage_laid_off' NULL
DELETE FROM layoffs_staging
WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;
-- Check for remaining NULLs if needed
SELECT *
FROM layoffs_staging
WHERE industry IS NULL OR total_laid_off IS NULL OR percentage_laid_off IS NULL;