February 26, 2025 •
10 min readIn database designing, two opposing approaches often cause debate: normalization and denormalization. Both have their place in modern database systems, but knowing when to use each is important for optimal performance and maintainability.
Normalization is a database design technique that reduces data redundancy and dependency by organizing fields and tables in a way that minimizes duplication. The process involves dividing large tables into smaller ones and defining relationships between them.
Denormalization is the process of intentionally adding redundancy to a database design by combining tables or adding redundant data. It's essentially the opposite of normalization, introducing duplication to improve read performance.
In a normalized design, we'd have separate tables for products, categories, and attributes:
-- Products table
CREATE TABLE products (
product_id INT PRIMARY KEY,
product_name VARCHAR(100),
price DECIMAL(10, 2),
category_id INT,
FOREIGN KEY (category_id) REFERENCES categories(category_id)
);
-- Categories table
CREATE TABLE categories (
category_id INT PRIMARY KEY,
category_name VARCHAR(50),
parent_category_id INT,
FOREIGN KEY (parent_category_id) REFERENCES categories(category_id)
);
-- Product attributes
CREATE TABLE product_attributes (
product_id INT,
attribute_id INT,
attribute_value VARCHAR(100),
PRIMARY KEY (product_id, attribute_id),
FOREIGN KEY (product_id) REFERENCES products(product_id),
FOREIGN KEY (attribute_id) REFERENCES attributes(attribute_id)
);
-- Attributes table
CREATE TABLE attributes (
attribute_id INT PRIMARY KEY,
attribute_name VARCHAR(50)
);
A query to get product details with category and attributes would be:
SELECT p.product_id, p.product_name, p.price, c.category_name,
a.attribute_name, pa.attribute_value
FROM products p
JOIN categories c ON p.category_id = c.category_id
JOIN product_attributes pa ON p.product_id = pa.product_id
JOIN attributes a ON pa.attribute_id = a.attribute_id
WHERE p.product_id = 123;
For a product catalog with millions of products where read performance is critical:
CREATE TABLE product_catalog (
product_id INT PRIMARY KEY,
product_name VARCHAR(100),
price DECIMAL(10, 2),
category_id INT,
category_name VARCHAR(50),
parent_category_name VARCHAR(50),
color VARCHAR(30),
size VARCHAR(10),
weight DECIMAL(6, 2),
material VARCHAR(50),
average_rating DECIMAL(3, 2),
num_reviews INT
);
The query becomes much simpler:
SELECT * FROM product_catalog WHERE product_id = 123;
-- Users table
CREATE TABLE users (
user_id INT PRIMARY KEY,
username VARCHAR(50),
profile_pic_url VARCHAR(255)
);
-- Posts table
CREATE TABLE posts (
post_id INT PRIMARY KEY,
user_id INT,
content TEXT,
created_at TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES users(user_id)
);
-- Likes table
CREATE TABLE likes (
user_id INT,
post_id INT,
created_at TIMESTAMP,
PRIMARY KEY (user_id, post_id),
FOREIGN KEY (user_id) REFERENCES users(user_id),
FOREIGN KEY (post_id) REFERENCES posts(post_id)
);
Query to generate a news feed:
SELECT p.post_id, p.content, p.created_at,
u.username, u.profile_pic_url,
COUNT(l.user_id) AS like_count
FROM posts p
JOIN users u ON p.user_id = u.user_id
LEFT JOIN likes l ON p.post_id = l.post_id
WHERE p.user_id IN (
SELECT followed_id
FROM follows
WHERE follower_id = 42
)
GROUP BY p.post_id, p.content, p.created_at, u.username, u.profile_pic_url
ORDER BY p.created_at DESC
LIMIT 20;
CREATE TABLE news_feed_items (
item_id INT PRIMARY KEY,
post_id INT,
user_id INT,
username VARCHAR(50),
profile_pic_url VARCHAR(255),
content TEXT,
created_at TIMESTAMP,
like_count INT,
comment_count INT,
has_media BOOLEAN,
media_type VARCHAR(20)
);
Query becomes:
SELECT * FROM news_feed_items
WHERE user_id IN (
SELECT followed_id
FROM follows
WHERE follower_id = 42
)
ORDER BY created_at DESC
LIMIT 20;
-- Accounts table
CREATE TABLE accounts (
account_id INT PRIMARY KEY,
customer_id INT,
account_type VARCHAR(20),
balance DECIMAL(12, 2),
FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
);
-- Transactions table
CREATE TABLE transactions (
transaction_id INT PRIMARY KEY,
account_id INT,
transaction_type VARCHAR(20),
amount DECIMAL(10, 2),
transaction_date TIMESTAMP,
category_id INT,
FOREIGN KEY (account_id) REFERENCES accounts(account_id),
FOREIGN KEY (category_id) REFERENCES categories(category_id)
);
-- Categories table
CREATE TABLE categories (
category_id INT PRIMARY KEY,
category_name VARCHAR(50)
);
Monthly spending report query:
SELECT c.category_name, SUM(t.amount) as total_spent
FROM transactions t
JOIN categories c ON t.category_id = c.category_id
WHERE t.account_id = 456
AND t.transaction_type = 'debit'
AND t.transaction_date BETWEEN '2025-01-01' AND '2025-01-31'
GROUP BY c.category_name
ORDER BY total_spent DESC;
CREATE MATERIALIZED VIEW monthly_spending_summary AS
SELECT
account_id,
EXTRACT(YEAR FROM transaction_date) AS year,
EXTRACT(MONTH FROM transaction_date) AS month,
category_id,
category_name,
SUM(CASE WHEN transaction_type = 'debit' THEN amount ELSE 0 END) AS total_debit,
SUM(CASE WHEN transaction_type = 'credit' THEN amount ELSE 0 END) AS total_credit,
COUNT(*) AS transaction_count
FROM transactions t
JOIN categories c ON t.category_id = c.category_id
GROUP BY account_id, EXTRACT(YEAR FROM transaction_date),
EXTRACT(MONTH FROM transaction_date), category_id, category_name;
The query becomes much faster:
SELECT category_name, total_debit as total_spent
FROM monthly_spending_summary
WHERE account_id = 456
AND year = 2025
AND month = 1
ORDER BY total_spent DESC;
// Products collection
db.createCollection("products", {
validator: {
$jsonSchema: {
bsonType: "object",
required: ["name", "price", "category_id"],
properties: {
name: { bsonType: "string" },
price: { bsonType: "decimal" },
category_id: { bsonType: "objectId" }
}
}
}
});
// Categories collection
db.createCollection("categories", {
validator: {
$jsonSchema: {
bsonType: "object",
required: ["name"],
properties: {
name: { bsonType: "string" },
parent_id: { bsonType: "objectId" }
}
}
}
});
// Product attributes collection
db.createCollection("product_attributes", {
validator: {
$jsonSchema: {
bsonType: "object",
required: ["product_id", "attribute_name", "value"],
properties: {
product_id: { bsonType: "objectId" },
attribute_name: { bsonType: "string" },
value: { bsonType: "string" }
}
}
}
});
Query with aggregation:
db.products.aggregate([
{ $match: { _id: ObjectId("123") } },
{ $lookup: {
from: "categories",
localField: "category_id",
foreignField: "_id",
as: "category"
}},
{ $lookup: {
from: "product_attributes",
localField: "_id",
foreignField: "product_id",
as: "attributes"
}},
{ $unwind: "$category" },
{ $project: {
_id: 1,
name: 1,
price: 1,
category_name: "$category.name",
attributes: 1
}}
]);
db.createCollection("product_catalog", {
validator: {
$jsonSchema: {
bsonType: "object",
required: ["name", "price"],
properties: {
name: { bsonType: "string" },
price: { bsonType: "decimal" },
category: {
bsonType: "object",
properties: {
id: { bsonType: "objectId" },
name: { bsonType: "string" },
parent: {
bsonType: "object",
properties: {
id: { bsonType: "objectId" },
name: { bsonType: "string" }
}
}
}
},
attributes: {
bsonType: "object"
},
average_rating: { bsonType: "decimal" },
review_count: { bsonType: "int" }
}
}
}
});
Example document:
{
"_id": ObjectId("123"),
"name": "Ergonomic Office Chair",
"price": 299.99,
"category": {
"id": ObjectId("456"),
"name": "Office Furniture",
"parent": {
"id": ObjectId("789"),
"name": "Furniture"
}
},
"attributes": {
"color": "Black",
"material": "Mesh",
"adjustable": true,
"weight_capacity": "300 lbs",
"dimensions": "26 x 22 x 39 inches"
},
"average_rating": 4.7,
"review_count": 128
}
Query becomes trivial:
db.product_catalog.findOne({ _id: ObjectId("123") });
Normalization shines in the following scenarios:
OLTP (Online Transaction Processing) Systems
Systems with Unpredictable Query Patterns
Development Phase
When Storage is Limited
Multi-User Environments with Concurrent Updates
Denormalization becomes valuable in these circumstances:
OLAP (Online Analytical Processing) Systems
Performance-Critical Applications
Time-Series Data
Real-Time Analytics
Distributed Databases
In modern database design, the choice isn't binary. Many systems benefit from a hybrid approach:
-- Normalized transactional tables for order processing
CREATE TABLE customers (
customer_id INT PRIMARY KEY,
first_name VARCHAR(50),
last_name VARCHAR(50),
email VARCHAR(100) UNIQUE
);
CREATE TABLE orders (
order_id INT PRIMARY KEY,
customer_id INT,
order_date TIMESTAMP,
status VARCHAR(20),
FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
);
CREATE TABLE order_items (
order_id INT,
product_id INT,
quantity INT,
price_at_purchase DECIMAL(10, 2),
PRIMARY KEY (order_id, product_id),
FOREIGN KEY (order_id) REFERENCES orders(order_id),
FOREIGN KEY (product_id) REFERENCES products(product_id)
);
-- Denormalized tables for product browsing and search
CREATE TABLE product_catalog (
product_id INT PRIMARY KEY,
name VARCHAR(100),
description TEXT,
price DECIMAL(10, 2),
category_name VARCHAR(50),
brand VARCHAR(50),
avg_rating DECIMAL(3, 2),
stock_level INT,
search_keywords TEXT
);
-- Materialized view for sales reporting
CREATE MATERIALIZED VIEW daily_sales_summary AS
SELECT
CAST(order_date AS DATE) AS sale_date,
p.category_id,
c.category_name,
COUNT(DISTINCT o.order_id) AS num_orders,
SUM(oi.quantity) AS items_sold,
SUM(oi.quantity * oi.price_at_purchase) AS revenue
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
JOIN categories c ON p.category_id = c.category_id
WHERE o.status = 'completed'
GROUP BY CAST(order_date AS DATE), p.category_id, c.category_name;
Ask yourself these questions when deciding between normalization and denormalization:
Neither normalization nor denormalization is universally superior. The right approach depends on your specific use case, workload characteristics, and performance requirements. Modern database systems often employ both strategies, normalizing for data integrity while denormalizing for performance.
The code examples above demonstrate how these principles translate into practical implementations. In many real-world systems, you'll find a mix of normalized structures for transactional data and denormalized structures for reporting and analysis.
The database design that serves your application's needs best will likely be somewhere on the spectrum between fully normalized and heavily denormalized. Finding that sweet spot is the art of database architecture.