diff --git a/. bundle install/_config.yml b/. bundle install/_config.yml
index ef7ba7c..0bce26a 100644
--- a/. bundle install/_config.yml
+++ b/. bundle install/_config.yml
@@ -1,55 +1,75 @@
-# Welcome to Jekyll!
-#
-# This config file is meant for settings that affect your whole blog, values
-# which you are expected to set up once and rarely edit after that. If you find
-# yourself editing this file very often, consider using Jekyll's data files
-# feature for the data you need to update frequently.
-#
-# For technical reasons, this file is *NOT* reloaded automatically when you use
-# 'bundle exec jekyll serve'. If you change this file, please restart the server process.
-#
-# If you need help with YAML syntax, here are some quick references for you:
-# https://learn-the-web.algonquindesign.ca/topics/markdown-yaml-cheat-sheet/#yaml
-# https://learnxinyminutes.com/docs/yaml/
-#
-# Site settings
-# These are used to personalize your new site. If you look in the HTML files,
-# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on.
-# You can create any custom variable you would like, and they will be accessible
-# in the templates via {{ site.myvariable }}.
-
-title: Your awesome title
-email: your-email@example.com
-description: >- # this means to ignore newlines until "baseurl:"
- Write an awesome description for your new site here. You can edit this
- line in _config.yml. It will appear in your document head meta (for
- Google search results) and in your feed.xml site description.
-baseurl: "" # the subpath of your site, e.g. /blog
-url: "" # the base hostname & protocol for your site, e.g. http://example.com
-twitter_username: jekyllrb
-github_username: jekyll
-
-# Build settings
-theme: minima
+title: COGS 108
+email: jfleischer@ucsd.edu
+description: >-
+ COGS 108, Spring 2026 at UC San Diego
+tagline: Data Science in Practice
+
+url: "https://COGS108.github.io"
+baseurl: "/LectureNotes"
+
+remote_theme: just-the-docs/just-the-docs
+
plugins:
+ - jekyll-remote-theme
- jekyll-feed
-# Exclude from processing.
-# The following items will not be processed, by default.
-# Any item listed under the `exclude:` key here will be automatically added to
-# the internal "default list".
-#
-# Excluded items can be processed by explicitly listing the directories or
-# their entries' file path in the `include:` list.
-#
-# exclude:
-# - .sass-cache/
-# - .jekyll-cache/
-# - gemfiles/
-# - Gemfile
-# - Gemfile.lock
-# - node_modules/
-# - vendor/bundle/
-# - vendor/cache/
-# - vendor/gems/
-# - vendor/ruby/
+search_enabled: true
+
+markdown: kramdown
+kramdown:
+ math_engine: mathjax
+ input: GFM
+ syntax_highlighter: rouge
+
+exclude:
+ - .sass-cache/
+ - .jekyll-cache/
+ - gemfiles/
+ - Gemfile
+ - Gemfile.lock
+ - node_modules/
+ - vendor/bundle/
+ - vendor/cache/
+ - vendor/gems/
+ - vendor/ruby/
+ - .data/
+ - docs
+ - dev-docs
+
+aux_links:
+ "🙋 Piazza": "https://piazza.com"
+ "💯 Gradescope": "https://www.gradescope.com"
+ "💪 Practice": "https://practice.dsc40a.com"
+
+aux_links_new_tab: true
+
+data_folder: sp26
+
+color_scheme: dark
+
+callouts_level: quiet
+callouts:
+ highlight:
+ title: Caution
+ color: yellow
+ important:
+ title: Important
+ color: blue
+ new:
+ title: New
+ color: green
+ note:
+ title: Note
+ color: purple
+ warning:
+ title: Warning
+ color: red
+
+back_to_top: true
+back_to_top_text: "Back to top"
+
+collections:
+ demos:
+ output: true
+ permalink: /demos/:path
+
\ No newline at end of file
diff --git a/.github/workflows/jekyll.yml b/.github/workflows/jekyll.yml
index cffca26..2683eb4 100644
--- a/.github/workflows/jekyll.yml
+++ b/.github/workflows/jekyll.yml
@@ -1,7 +1,7 @@
name: Deploy Jekyll site to Pages
on:
push:
- branches: ["main"]
+ branches: ["main", "fix-github-pages-deploy"]
workflow_dispatch:
permissions:
@@ -25,7 +25,7 @@ jobs:
- uses: actions/configure-pages@v4
id: pages
- name: Build with Jekyll
- run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}"
+ run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}" --destination ./docs
env:
JEKYLL_ENV: production
- uses: actions/upload-pages-artifact@v3
diff --git a/Gemfile.lock b/Gemfile.lock
index 725f171..6ab5db8 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -48,6 +48,7 @@ GEM
net-http (~> 0.5)
ffi (1.17.4-aarch64-linux-gnu)
ffi (1.17.4-arm64-darwin)
+ ffi (1.17.4-x64-mingw-ucrt)
ffi (1.17.4-x86_64-linux-gnu)
ffi (1.17.4-x86_64-linux-musl)
forwardable-extended (2.6.0)
@@ -245,6 +246,8 @@ GEM
racc (~> 1.4)
nokogiri (1.19.2-arm64-darwin)
racc (~> 1.4)
+ nokogiri (1.19.2-x64-mingw-ucrt)
+ racc (~> 1.4)
nokogiri (1.19.2-x86_64-linux-gnu)
racc (~> 1.4)
nokogiri (1.19.2-x86_64-linux-musl)
@@ -287,6 +290,7 @@ PLATFORMS
arm64-darwin-21
arm64-darwin-22
arm64-darwin-24
+ x64-mingw-ucrt
x86_64-linux
x86_64-linux-musl
diff --git a/README.md b/README.md
index 89a58b7..e7e7320 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-[](https://github.com/kshannon-ucsd/dsc40a/actions/workflows/jekyll.yml)
+[](https://github.com/COGS108/LectureNotes/actions/workflows/jekyll.yml)
-# DSC40a
+# COGS 108
Class Website for UCSD's Data Science in Practice
@@ -29,21 +29,21 @@ Class Website for UCSD's Data Science in Practice
### Building for Local
All dev builds will be built into a `dev-docs/` dir. This dir is included in `.gitignore` and is only used for development builds. Any specific dev config should be included in the `_config_dev.yml` file. Your absolute path to `dev-docs` will be different.
1. `bundle exec jekyll serve --config _config.yml,_config_dev.yml --destination ./dev-docs` #builds and starts local server on localhost
-2. navigate to http://localhost:4000/dsc40a/
+2. Navigate to http://localhost:4000/cogs108/
### Building for Prod
1. A merge or direct push to main branch will automatically trigger the jekyll workflow which will build and deploy the main branch to production.
2. Check status in github actions for build results.
-3. Navigate to https://kshannon-ucsd.github.io/dsc40a/
-4. 4. if building with HDSI's domain dsc40a.com add the `CNAME` file with the url entry `dsc40a.com` into the root
+3. Navigate to https://COGS108.github.io/LectureNotes/
+4. If using a custom domain, add the `CNAME` file with the domain into the root
### /data/ directory
This dir is excluded from the jekyll build chain (excluded in the config.yaml file).
-Set this up correctly fopr the next iteration:
+Set this up correctly for the next iteration:
`Change this to choose where to pick data from`
`data_folder: su24` becomes e.g. wi25
-This folder includes data that changes from quarter to quarter, e.g. course calender. I don't want to recreate these data, so I dump them here to use for later. Eventually I would like to create variables to pull this data, but for now this is a fine solution.
+This folder includes data that changes from quarter to quarter, e.g. course calendar. This allows reuse without recreating data each term. Eventually this can be abstracted into variables.
## Course Calendar
The following tags can be applied within a row for each course activity, set within the `_data/term/course_calendar.csv` file.
@@ -55,5 +55,4 @@ The following tags can be applied within a row for each course activity, set wit
- GRPW --> groupwork
- PRAC --> practice problems
- Any other tag used will default to a black colored tag
-
+Any other tag used will default to a black colored tag
\ No newline at end of file
diff --git a/_data/demos.yml b/_data/demos.yml
deleted file mode 100644
index cb94305..0000000
--- a/_data/demos.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-- title: 1. Parameter and Feature Space
- url: demos/1_parameter_feature_space
- photo: /assets/images/demos/feature_param_space.png
- description: This interactive demo explores the relationship between feature and parameter spaces in regression models. By manipulating model parameters, you can visualize how changes affect both the model fit in feature space and the corresponding position in parameter space, gaining intuition about why squared error creates nicely behaved optimization surfaces with global minima.
-
-- title: 2. Linear Regression
- url: demos/2_linear_regression
- photo: /assets/images/demos/simple-loss.jpg
- description: This is a simple demo showing how loss and the function you are trying to learn with linear regression work together. Recently rebuilt in JavaScript, but I first built this in Python within a jupyter notebook. You can download my notebook below to dive deeper!
- links:
- - url: https://ucsd.s3.us-west-2.amazonaws.com/dsc40a/demos/demo_01.ipynb
- text: Python Notebook
-
-- title: 3. Loss Surface Navigation
- url: demos/3_loss_surfaces
- photo: /assets/images/demos/loss-surface.png
- description: This demo shows how gradient descent moves through various loss surfaces in 3d and 2d contour maps. It also lets you compare Adam to plain gradient descent!
-
-- title: 4. Huber Loss
- url: demos/4_huber_loss
- photo: /assets/images/demos/huber.png
- description: This interactive demo showcases how the Huber loss improves regression robustness in the presence of outliers. By adjusting polynomial degree, noise, and the Huber threshold (δ), users can see how the loss function shifts from squared error for small deviations to absolute error for larger ones. The demo compares traditional least-squares with Huber regression—illustrating model fit, error distributions, and learning curves—to highlight the balance between sensitivity to noise and outlier resistance, ultimately yielding more generalizable models.
-
-- title: 5. Interaction Terms
- url: demos/5_interaction_terms
- photo: /assets/images/demos/interaction_terms.png
- description: This demo mathematically compares additive versus multiplicative interaction models using a clinical trial dataset as an example, focusing on how design matrices and linear independence impact the model's identifiability. In the additive case, the interaction term is merely a linear combination of the main effects—resulting in parallel lines—whereas the multiplicative model introduces a linearly independent term that allows for different slopes between groups, yielding unique solutions.
-
-- title: 6. Overfitting and Regularization
- url: demos/6_regularization_demo
- photo: /assets/images/demos/overfitting-model.png
- description: This interactive demo visualizes how regularization techniques control model complexity in polynomial regression. By adjusting polynomial degree, regularization strength (λ), and noise levels, users can observe firsthand how Ridge (L2) and Lasso (L1) regularization prevent overfitting by constraining coefficient magnitudes. The visualization reveals the bias-variance tradeoff through side-by-side comparisons of model fit, coefficient distributions, and learning curves, demonstrating how proper regularization leads to smoother models that generalize better despite sacrificing training accuracy.
-
-- title: 7. Multi-Armed Bandit
- url: demos/7_multi_armed_bandit
- photo: /assets/images/demos/multiarmedbandit.jpg
- description: This interactive multiarmed bandit demo showcases the thrilling challenge of balancing exploration and exploitation as you try your luck on slot machines with hidden success rates. It dynamically updates your winning estimates using Bayesian methods, urging you to strategically alternate between testing new machines and capitalizing on proven performers to maximize your overall reward.
-
-- title: 8. K-Means Clustering
- url: demos/8_kmeans_clustering
- photo: /assets/images/demos/kmeans.png
- description: This interactive K-means clustering demo illustrates the iterative process of partitioning data points by assigning each to the nearest centroid based on squared Euclidean distance, then updating centroids to minimize the overall Sum of Squared Errors (SSE). It further employs the elbow method to pinpoint the optimal number of clusters by identifying the stage where adding extra clusters yields diminishing improvements in error reduction.
\ No newline at end of file
diff --git a/docs/assets/images/demos/construction.png b/docs/assets/images/demos/construction.png
deleted file mode 100644
index ae412bd..0000000
Binary files a/docs/assets/images/demos/construction.png and /dev/null differ
diff --git a/docs/assets/images/demos/feature_param_space.png b/docs/assets/images/demos/feature_param_space.png
deleted file mode 100644
index cb18d38..0000000
Binary files a/docs/assets/images/demos/feature_param_space.png and /dev/null differ
diff --git a/docs/assets/images/demos/huber.png b/docs/assets/images/demos/huber.png
deleted file mode 100644
index b034690..0000000
Binary files a/docs/assets/images/demos/huber.png and /dev/null differ
diff --git a/docs/assets/images/demos/interaction_terms.png b/docs/assets/images/demos/interaction_terms.png
deleted file mode 100644
index ea6ab9c..0000000
Binary files a/docs/assets/images/demos/interaction_terms.png and /dev/null differ
diff --git a/docs/assets/images/demos/kmeans.png b/docs/assets/images/demos/kmeans.png
deleted file mode 100644
index c18df08..0000000
Binary files a/docs/assets/images/demos/kmeans.png and /dev/null differ
diff --git a/docs/assets/images/demos/loss-surface.png b/docs/assets/images/demos/loss-surface.png
deleted file mode 100644
index a05f7bf..0000000
Binary files a/docs/assets/images/demos/loss-surface.png and /dev/null differ
diff --git a/docs/assets/images/demos/multiarmedbandit.jpg b/docs/assets/images/demos/multiarmedbandit.jpg
deleted file mode 100644
index c179e45..0000000
Binary files a/docs/assets/images/demos/multiarmedbandit.jpg and /dev/null differ
diff --git a/docs/assets/images/demos/overfitting-model.png b/docs/assets/images/demos/overfitting-model.png
deleted file mode 100644
index e3ce6d9..0000000
Binary files a/docs/assets/images/demos/overfitting-model.png and /dev/null differ
diff --git a/docs/assets/images/demos/simple-loss.jpg b/docs/assets/images/demos/simple-loss.jpg
deleted file mode 100644
index b721820..0000000
Binary files a/docs/assets/images/demos/simple-loss.jpg and /dev/null differ
diff --git a/docs/assets/images/faq2-mad.png b/docs/assets/images/faq2-mad.png
deleted file mode 100644
index fd5e3fa..0000000
Binary files a/docs/assets/images/faq2-mad.png and /dev/null differ
diff --git a/docs/assets/images/faq2-pros-cons.png b/docs/assets/images/faq2-pros-cons.png
deleted file mode 100644
index 47045de..0000000
Binary files a/docs/assets/images/faq2-pros-cons.png and /dev/null differ
diff --git a/docs/assets/images/faq2-surface.png b/docs/assets/images/faq2-surface.png
deleted file mode 100644
index 6f7ef04..0000000
Binary files a/docs/assets/images/faq2-surface.png and /dev/null differ
diff --git a/docs/assets/images/faq2-unique.png b/docs/assets/images/faq2-unique.png
deleted file mode 100644
index 9be03cd..0000000
Binary files a/docs/assets/images/faq2-unique.png and /dev/null differ
diff --git a/docs/assets/images/just-the-docs.png b/docs/assets/images/just-the-docs.png
deleted file mode 100644
index 81c3306..0000000
Binary files a/docs/assets/images/just-the-docs.png and /dev/null differ
diff --git a/docs/assets/images/large-image.jpg b/docs/assets/images/large-image.jpg
deleted file mode 100644
index c007781..0000000
Binary files a/docs/assets/images/large-image.jpg and /dev/null differ
diff --git a/docs/assets/images/search.svg b/docs/assets/images/search.svg
deleted file mode 100644
index 421ca4d..0000000
--- a/docs/assets/images/search.svg
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/docs/assets/images/small-image.jpg b/docs/assets/images/small-image.jpg
deleted file mode 100644
index 5bf58a9..0000000
Binary files a/docs/assets/images/small-image.jpg and /dev/null differ
diff --git a/docs/demos/1_parameter_feature_space.html b/docs/demos/1_parameter_feature_space.html
deleted file mode 100644
index 8e04aba..0000000
--- a/docs/demos/1_parameter_feature_space.html
+++ /dev/null
@@ -1,701 +0,0 @@
-
-
- This interactive demo helps you understand why we use mean squared error (or other averages) rather than
- simple sums of errors in regression problems. More importantly, it illustrates how feature space and parameter space are
- intrinsically connected through the concept of loss functions. You can interactively adjust model parameters and see how
- they affect both the model fit and the optimization landscape.
- [interactive demo UI removed: model/parameter sliders, error-metric readouts (sum of errors, mean error, sum of squared errors, mean squared error), and the "Feature Space" and "Parameter Space" plot panels]
Why Do We Use Average Errors Instead of Summing?
-
-
- When evaluating regression models, we typically use averaged error metrics
- (like Mean Squared Error) rather than simple sums of errors. Here's why:
-
-
-
-
Sum of Raw Errors Problem:
-
- For raw errors (predicted - actual), positive and negative errors cancel each other out:
-
- $$\sum_{i=1}^n (\hat{y}_i - y_i)$$
-
- A model could make huge errors in both directions, but still have a near-zero sum!
-
-
-
Squared or Absolute Errors:
-
- We often use squared errors or absolute errors to make all errors positive:
-
- $$\text{Squared Error: } \sum_{i=1}^n (\hat{y}_i - y_i)^2 \quad \text{or} \quad
- \text{Absolute Error: } \sum_{i=1}^n |\hat{y}_i - y_i|$$
-
-
-
Why Average (Mean) Instead of Sum?
-
-
-
Scale Invariance: The error sum grows with dataset size, making it difficult to compare models across different-sized datasets.
-
Interpretation: Means are easier to interpret - they represent the typical error per data point.
-
Mathematical Properties: For gradient-based optimization, using means introduces a constant factor (1/n) that doesn't change the optimization landscape's shape.
-
-
-
-
-
-
Parameter Space vs. Feature Space
-
- In machine learning, we work in two complementary spaces that are connected through our loss function:
-
-
-
- Feature Space: This is where your data lives and where you visualize your model's predictions.
- For regression, this typically shows input features (x-axis) and output values (y-axis). Each point represents
- an observation, and the line/curve shows your model's predictions across the input domain.
-
-
-
- Parameter Space: This is the space of all possible parameter values for your model.
-
-
For a constant model: 1-dimensional space (just the constant c)
-
For a linear model: 2-dimensional space (slope m and intercept b)
-
For more complex models: Higher dimensional spaces
-
-
-
-
- The error surface in parameter space shows how the error changes as you vary the model parameters.
- Each point in parameter space corresponds to a specific model configuration in feature space.
- The key insight is that certain loss functions (like squared error) create nice, convex error surfaces
- with a single global minimum, making optimization straightforward.
-
-
-
-
-
Special Case: Raw Error Loss Landscapes
-
- When you select "Raw Error" as your error metric, you'll observe some interesting behavior:
-
-
-
- For the constant model: The parameter space shows a straight line rather than a curve.
- This happens because the raw error function for a constant model is:
-
- $$\frac{1}{n}\sum_{i=1}^n (c - y_i) = c - \frac{1}{n}\sum_{i=1}^n y_i$$
-
- This is a linear function of the parameter c, creating a straight line in parameter space. The line
- crosses zero at c = mean(y), which is why the minimum raw error occurs at the mean of y values.
-
-
-
- For the linear model: The parameter space shows a flat plane with a line where the error is zero.
- This happens because the raw error function for a linear model is:
-
- $$\frac{1}{n}\sum_{i=1}^n (mx_i + b - y_i) = m\cdot\text{mean}(x) + b - \text{mean}(y)$$
-
- This creates a flat plane in the (m,b) space. Any combination of m and b that satisfies
- m·mean(x) + b = mean(y) will have zero raw error! This is why raw error is problematic for
- optimization - it doesn't provide a unique solution for the model parameters.
-
-
-
-
- Try it yourself: Experiment with the sliders to see how changing parameters affects
- both spaces simultaneously. Notice how squared error creates a bowl-shaped (paraboloid) surface in parameter space,
- while absolute error creates a more angular surface, and raw error can create complex surfaces with no clear minimum.
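As a concrete check of the raw-error problem above, here is a minimal numpy sketch (the data are made up, not from the demo): mean raw error is linear in the constant prediction `c` and crosses zero at `c = mean(y)`, while mean squared error is convex and minimized there.

```python
import numpy as np

# Hypothetical data for the constant model H(x) = c
rng = np.random.default_rng(0)
y = rng.normal(loc=5.0, scale=2.0, size=20)

for c in [0.0, y.mean(), 10.0]:
    raw = np.mean(c - y)          # linear in c; exactly zero at c = mean(y)
    mse = np.mean((c - y) ** 2)   # convex in c; minimized at c = mean(y)
    print(f"c={c:6.2f}  mean raw error={raw:7.3f}  MSE={mse:7.3f}")
```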
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/demos/2_linear_regression.html b/docs/demos/2_linear_regression.html
deleted file mode 100644
index b34daae..0000000
--- a/docs/demos/2_linear_regression.html
+++ /dev/null
@@ -1,1503 +0,0 @@
-
-
-
-
-
- Interactive Linear Regression
-
-
-
-
-
-
-
-
-
-
Linear Regression: MSE vs MAE Comparison
-
-
-
This interactive demo helps you understand how linear regression works with different loss functions. Adjust the sliders to modify the regression line and observe how it affects both MSE (Mean Squared Error) and MAE (Mean Absolute Error) in real-time.
-
Interactive Elements in This Demo
-
-
Adjust the slope and intercept sliders to see how your regression line changes
-
Generate random data with different sizes and noise levels
-
Add outliers to see how they affect each loss function differently
-
Try different example datasets to explore various data patterns
-
Click directly on the plot to add points; double-click to remove them
-
-
- [interactive demo UI removed: slope/intercept sliders, data-generation controls, example-dataset buttons, and live MSE/MAE best-fit readouts]
Loss Functions Comparison
-
-
-
-
-
- MSE Loss: Quadratic, smooth curve that heavily penalizes large errors
-
-
-
-
-
-
- MAE Loss: V-shaped, linear penalty that treats all error sizes equally
-
-
-
-
-
-
-
-
Geometric Interpretation of Residuals
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Parameter Space Comparison
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Connection to Parameter Space
-
As seen in Demo 1, every point in parameter space (w₀, w₁) corresponds to a line in feature space. The optimal parameters are found at the lowest point of the loss surface.
-
-
Each point on the parameter space surfaces represents a possible model with specific slope and intercept values. The height of the surface shows the loss value for that model. The MSE surface is smooth and bowl-shaped with a unique minimum, while the MAE surface has sharper edges and can have multiple optimal solutions along a line.
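One consequence of these two surface shapes is easy to verify numerically. A minimal numpy sketch (illustrative data with one outlier, using the constant model from Demo 1 for simplicity): grid-searching the constant prediction that minimizes each risk recovers the mean for MSE and the median for MAE.

```python
import numpy as np

y = np.array([72.0, 90.0, 61.0, 85.0, 92.0, 350.0])    # one large outlier
h = np.linspace(0, 400, 4001)                          # candidate constant predictions

mse = ((y[None, :] - h[:, None]) ** 2).mean(axis=1)
mae = np.abs(y[None, :] - h[:, None]).mean(axis=1)

print("MSE minimizer ~", h[mse.argmin()], "| mean   =", y.mean())      # pulled toward the outlier
print("MAE minimizer ~", h[mae.argmin()], "| median =", np.median(y))  # robust; any value between
                                                                       # the middle two points is optimal
```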
-
-
Summary and Explanations
-
-
Linear Regression Model
-
Linear regression finds a linear relationship between variables: \( \hat{y} = w_0 + w_1 x \), where \(w_0\) is the intercept and \(w_1\) is the slope.
MSE squares each error, so large deviations dominate the fit; it produces a smooth, bowl-shaped loss surface and is sensitive to outliers.
MAE uses the absolute value of errors, treating all error magnitudes more uniformly. It produces a more angular loss surface and is more robust to outliers.
Residuals Plot: Shows how each loss function measures error. MSE creates areas (squares) while MAE uses lengths (line segments).
Residuals Plot: Shows how each loss function measures error. MSE creates areas (squares) while MAE uses lengths (line segments).
-
Loss Curves: Demonstrate how the loss changes as you adjust the slope parameter. Notice the smooth parabola for MSE versus the V-shape for MAE.
-
Parameter Space: 3D visualizations of the loss surface over all possible combinations of slope and intercept. MSE creates a smooth bowl, while MAE creates a more angular surface.
-
-
-
Key Differences
| Aspect | MSE | MAE |
|---|---|---|
| Sensitivity to outliers | High (squares errors) | Low (linear penalty) |
| Loss surface | Smooth, differentiable everywhere | Angular, not differentiable at zero error |
| Computational complexity | Simple closed-form solution | Requires median calculations |
| Optimal solution | Mean-centered | Median-centered |
-
-
-
-
\ No newline at end of file
diff --git a/docs/demos/3_loss_surfaces.html b/docs/demos/3_loss_surfaces.html
deleted file mode 100644
index db22650..0000000
--- a/docs/demos/3_loss_surfaces.html
+++ /dev/null
@@ -1,825 +0,0 @@
-
-
-
- Interactive Gradient Descent
-
-
-
-
-
-
-
-
-
Interactive Gradient Descent
-
- Explore how gradient descent navigates loss surfaces. Try the pre-set scenarios below
- to see different behaviors: divergence, getting trapped in local minima, escaping saddle points,
- and smooth convergence. Toggle "Compare: Adam" to see how adaptive optimizers differ.
-
- [interactive demo UI removed: scenario presets, start-point and learning-rate sliders, GD/Adam comparison toggle, epoch/loss readout, and the "3D Surface", "2D Contour", and "Loss vs. Epoch" panels]
Adam vs. Plain Gradient Descent
-
- In plain gradient descent, each update subtracts the gradient scaled by a
- single learning rate \(\alpha\):
-
- $$
- \theta_{t+1} \;=\; \theta_t \;-\; \alpha \,\nabla_\theta J(\theta_t)
- $$
-
-
-
-
\(\theta_t\): The parameter vector (e.g., the weights of your model) at iteration \(t\).
-
\(\alpha\): The learning rate or step size.
-
\(\nabla_\theta J(\theta_t)\): The gradient of the loss function with respect to the parameters at iteration \(t\). This gradient indicates the direction in which \(J\) increases most rapidly.
-
\(J(\theta_t)\): The loss (or cost) function evaluated at \(\theta_t\).
-
-
-
-
- What does Momentum add?
- Unlike in plain gradient descent, with Momentum, we introduce
- a “velocity” \(v_t\) that accumulates past gradients:
-
- $$
- v_t \;=\; \mu \,v_{t-1} \;+\; \alpha \,\nabla_{\theta}J(\theta_t),
- \quad
- \theta_{t+1} \;=\; \theta_t \;-\; v_t
- $$
-
-
-
\(v_t\) is the current velocity (a rolling average of gradients).
-
\(v_{t-1}\) is the velocity from the previous step.
-
\(\mu\) (often around 0.9) is the momentum factor, controlling how much
- of the past velocity we retain.
-
\(\alpha\) is now multiplied into the velocity rather than directly into \(\nabla_{\theta} J(\theta_t)\).
-
-
- This way, if gradients keep pointing in the same direction,
- \(v_t\) grows, speeding us along. If they fluctuate, \(v_t\) smooths out the noise
- by averaging recent steps instead of reacting only to the current gradient.
-
-
-
- What about RMSProp?
-
-
-
- It adapts the learning rate by tracking how big or small gradients typically are for
- each parameter. If gradients are large, RMSProp shrinks the step size; if they are small,
- it enlarges the step. In simpler terms:
-
- $$
- \text{Adaptive Step Size} \;\approx\; \frac{1}{\sqrt{\text{rolling average of }(\nabla_{\theta} J(\theta))^2}}
- $$
-
-
-
-
\(\nabla_\theta J(\theta_t)\): As above, this is the gradient of the loss function at the current parameters.
-
Rolling Average of Squared Gradients: A moving average that tracks the squared values of the gradients. This average is used to adapt the learning rate for each parameter dynamically.
-
-
-
- That means parameters with consistently big gradients slow down,
- while those with small gradients speed up.
-
-
-
-
- Adam (Adaptive Moment Estimation) extends this by tracking two moving averages
- of the gradients:
- 1. A “first moment” (numerator) which acts like momentum—an exponential average of
- gradients that can help smooth out noise.
- 2. A “second moment” comes from RMSProp (denominator) that tracks the average of squared gradients,
- adjusting the effective learning rate so steep directions get smaller updates.
-
- The update roughly looks like:
- $$
- \theta_{t+1}
- \;\approx\;
- \theta_t
- \;-\;
- \frac{\alpha \,\text{(avg gradient)}}{\sqrt{\text{(avg of gradient}^2)} + \epsilon}
- $$
-
- The numerator tells you which direction to move, the denominator tells you how much, and the learning rate sets the pace.
- Finally, \(\epsilon\) is a small constant (often around \(10^{-8}\)) to avoid dividing by
- zero and ensure stable updates. Adam automatically tunes step sizes for each
- parameter dimension. In practice (high-dimensional spaces), this often converges faster and is more tolerant
- of tricky or noisy gradients.
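To make the comparison concrete, here is a minimal numpy sketch of both update rules on a toy quadratic loss (the loss, start point, and hyperparameters are illustrative, not taken from the demo):

```python
import numpy as np

def grad(theta):
    # Gradient of a toy quadratic loss J(θ) = θ₀² + 10·θ₁²
    return np.array([2.0, 20.0]) * theta

theta_gd = np.array([3.0, 2.0])        # arbitrary start point
theta_adam = theta_gd.copy()
m, v = np.zeros(2), np.zeros(2)        # Adam's first/second moment estimates
alpha, b1, b2, eps = 0.05, 0.9, 0.999, 1e-8

for t in range(1, 101):
    theta_gd = theta_gd - alpha * grad(theta_gd)          # plain GD: one global step size
    g = grad(theta_adam)
    m = b1 * m + (1 - b1) * g                             # momentum-style average of gradients
    v = b2 * v + (1 - b2) * g ** 2                        # RMSProp-style average of squared gradients
    m_hat, v_hat = m / (1 - b1 ** t), v / (1 - b2 ** t)   # bias correction
    theta_adam = theta_adam - alpha * m_hat / (np.sqrt(v_hat) + eps)

print("GD:", theta_gd, "| Adam:", theta_adam)             # both approach the minimum at (0, 0)
```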
-
diff --git a/docs/demos/4_huber_loss.html b/docs/demos/4_huber_loss.html
deleted file mode 100644
--- a/docs/demos/4_huber_loss.html
+++ /dev/null
Rookie Evaluation: Young players may have extreme variance while developing
-
-
Example: A guard scores 50 points when the opponent's best defender is injured, but
- averages 18 PPG otherwise. Advanced Analytics prevents overvaluing this outlier while acknowledging scoring
- ability.
-
-
-
-
🏀 NBA Front Office Implementation
-
- When implementing Advanced Analytics in NBA front offices:
-
-
-
- Role-Based Evaluation: Different tolerance levels for stars vs. role players
-
-
- Recent Performance: Weight the last 20 games more heavily than early season
-
-
- Contextual Factors: Adjust for pace, opponent defensive rating, and lineup configurations
-
-
- Development Curve: Rookies and sophomores need different evaluation metrics than veterans
-
-
-
- Modern NBA front offices use these advanced statistical approaches to make better draft picks, trades, and
- free agent signings. By balancing the eye test with robust analytics, teams can identify undervalued players
- and avoid overpaying for statistical anomalies.
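For reference, the Huber loss this demo is built on (per its description in `_data/demos.yml` above) switches from a squared to a linear penalty at a threshold δ. A minimal numpy sketch:

```python
import numpy as np

def huber(residual, delta=1.0):
    # Squared for small residuals, linear for large ones -> robust to outliers
    r = np.abs(residual)
    return np.where(r <= delta, 0.5 * r ** 2, delta * (r - 0.5 * delta))

residuals = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(huber(residuals, delta=1.0))   # outliers penalized linearly, not quadratically
```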
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/demos/5_interaction_terms.html b/docs/demos/5_interaction_terms.html
deleted file mode 100644
index 6cc1c9c..0000000
--- a/docs/demos/5_interaction_terms.html
+++ /dev/null
@@ -1,799 +0,0 @@
-
-
-
- Understanding Interaction Effects in Clinical Trials
-
-
-
-
-
-
-
-
-
-
-
Understanding Interaction Effects in Clinical Trials
-
-
-
-
-
-
-
-
-
-
-
-
Clinical Trial Scenario: AntiHyp Drug Study
-
- A clinical trial is studying a new antihypertensive medication (AntiHyp) across different patient groups.
- Researchers want to understand if the drug works differently for male versus female patients.
-
-
- Variables:
-
-
-
Treatment: Drug dosage level (standardized units, ranging from -2 to 2)
-
Group: Patient biological sex
-
Response (Y): Reduction in blood pressure (mm Hg)
-
-
-
-
-
How We Encode the Categorical Variable
-
- We code patient group as a number: Female = -1, Male = +1.
-
-
- This ±1 encoding centers the variable at zero. The benefit: β₀ becomes the overall average response
- (not just the response for one group), and the math for interpreting interactions stays clean.
- You'll see two clusters of points in the plots below—one for each group.
-
-
-
-
Let's look at the raw data. Do these groups seem to respond differently to treatment?
-
-
-
-
-
We want to build a model to predict blood pressure reduction:

$$Y = \beta_0 + \beta_1 \cdot \text{Treatment} + \beta_2 \cdot \text{Group}$$
- This model assumes treatment works equally well for both groups.
- The only difference between groups is a vertical shift (β₂ moves one line up or down).
-
-
-
-
-
Model Fit
-
-
-
-
Residuals
-
-
-
-
-
- Look at the residuals. If the model fit well, they would scatter randomly around zero.
- Instead, notice how the colors separate—Female residuals trend one direction while Male residuals trend the other.
- The model is systematically wrong. It's missing something.
-
-
-
-
Second Attempt: Adding an "Interaction" Term
-
-
- A colleague suggests: "We need to account for how Treatment and Group interact.
- Let's add a term that combines them: (Treatment + Group)."
-
- What do you think—will this capture the interaction effect?
-
- The "interaction" term just gets absorbed into the existing coefficients!
- It's mathematically equivalent to the first model—no new information is added.
-
-
- In matrix terms: the column for (Treatment + Group) is a linear combination of existing columns,
- making the design matrix rank-deficient.
-
-
-
-
-
The Key Insight: Multiplication
-
-
What if instead of adding Treatment and Group, we multiply them?

$$Y = \beta_0 + \beta_1 \cdot \text{Treatment} + \beta_2 \cdot \text{Group} + \beta_3 \cdot (\text{Treatment} \times \text{Group})$$
- Now the lines diverge! The residuals scatter randomly—no more systematic pattern by group.
- The model finally captures the different treatment effects.
-
-
-
-
Why Multiplication Works
-
The slope of Y with respect to Treatment now depends on the group:

$$\frac{\partial Y}{\partial \text{Treatment}} = \beta_1 + \beta_3 \cdot \text{Group}$$
- Multiplication creates a new piece of information that cannot be replicated
- by adjusting other coefficients. The product column is linearly independent.
-
-
-
-
-
Explore: Build Your Own Interaction
-
-
- Now it's your turn. Adjust the sliders to set the true interaction strength and noise level,
- then generate data to see how well the multiplicative model captures it.
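The rank argument above can be checked numerically. A minimal numpy sketch with simulated data (all values illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)
treatment = rng.normal(size=40)
group = rng.choice([-1.0, 1.0], size=40)   # ±1 encoding, as in the demo
ones = np.ones(40)

X_add  = np.column_stack([ones, treatment, group, treatment + group])
X_mult = np.column_stack([ones, treatment, group, treatment * group])

print(np.linalg.matrix_rank(X_add))   # 3: (Treatment + Group) is a combination of existing columns
print(np.linalg.matrix_rank(X_mult))  # 4: the product adds a linearly independent column
```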
-
- This interactive demo illustrates how regularization helps to control model complexity in polynomial regression.
- Adjust the sliders to see how the polynomial degree, regularization strength (λ), and data noise affect the model's fit.
- [interactive demo UI removed: polynomial-degree, λ, noise, and sample-size sliders; the "Model Fit Visualization", "Coefficient Magnitudes", and "Training and Validation Error" plots; and a "Model Performance" coefficient table]
Understanding Regularization
-
-
-
-
-
Mathematical Formulation
-
-
- In polynomial regression, we fit a model of the form:
-
- $$H(x) = w_0 + w_1 x + w_2 x^2 + \cdots + w_d x^d$$
-
- Regularization adds a penalty on coefficient size to the mean squared error: Ridge (L2) adds \(\lambda \sum_{j} w_j^2\), while Lasso (L1) adds \(\lambda \sum_{j} \lvert w_j \rvert\).
-
- The regularization parameter λ controls the strength of the penalty. Higher values enforce stronger regularization, resulting in smaller coefficients.
-
-
-
-
-
-
Key Concepts
-
-
- Why Regularization Matters:
-
-
-
Overfitting: High-degree polynomials can perfectly fit training data but generalize poorly
-
Bias-Variance Tradeoff: Regularization increases bias but reduces variance
-
Numerical Stability: Prevents exploding coefficients and improves conditioning
-
-
-
- Comparing Regularization Types:
-
-
-
Ridge (L2): Shrinks all coefficients toward zero, but rarely to exactly zero
-
Lasso (L1): Promotes sparsity by forcing some coefficients to exactly zero (feature selection)
-
No Regularization: Fits the data as closely as possible, potentially overfitting
-
-
-
- Try it: Increase the polynomial degree and observe how unregularized models start to overfit. Then increase λ to see how regularization smooths the curve and stabilizes predictions.
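A minimal numpy sketch of the Ridge effect described above (synthetic data; for simplicity the intercept is penalized along with the other coefficients):

```python
import numpy as np

rng = np.random.default_rng(1)
x = np.linspace(-1, 1, 15)
y = np.sin(2 * x) + rng.normal(scale=0.3, size=x.size)   # noisy target

X = np.vander(x, N=10, increasing=True)                  # degree-9 polynomial features
lam = 1e-2
I = np.eye(X.shape[1])

w_ols   = np.linalg.lstsq(X, y, rcond=None)[0]           # unregularized fit
w_ridge = np.linalg.solve(X.T @ X + lam * I, X.T @ y)    # ridge: shrinks coefficients

print("max |w| OLS:  ", np.abs(w_ols).max())
print("max |w| ridge:", np.abs(w_ridge).max())           # noticeably smaller
```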
-
- Each round has four shuffled slot machines with different hidden payout rates. Your job is to learn quickly, update your beliefs after every pull, and then exploit the strongest machine before the round ends. The leaderboard should rank students by Skill Score, which rewards choosing strong machines across the full tournament, not just getting lucky in one round.
-
-
-
-
- Bayesian intuition
-
Every win adds evidence that a machine might be good. Every loss adds evidence that it might not be. Early on, uncertainty is wide, so a single outcome should move you a little, not all the way.
-
-
- Why the old exploit breaks
-
Arm labels reshuffle every round, and your leaderboard score tracks the quality of the hidden probabilities you chose. Repeating the same button no longer produces a strong score.
-
-
-
-
-
-
How scoring works
-
-
Round Reward is the noisy part: how many wins you actually saw in one round.
-
Skill Score is the main score: how strong your choices were across the tournament.
-
Each round lasts at most five minutes, so you need to explore early and commit before the timer locks the machines.
-
-
-
- Start by treating every machine as a 50/50 guess.
- Every win nudges your estimate up. Every loss nudges it down.
- Quick update rule: (wins + 1) / (wins + losses + 2)
- Example: 3 wins and 1 loss gives 4 / 6 = 66.7%
- Skill Score = 100 x (sum of true probabilities you chose) / (sum of best available probabilities)
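That update rule is the posterior mean of a Beta(1 + wins, 1 + losses) distribution, i.e. a uniform prior updated by the observed pulls. A minimal sketch:

```python
def posterior_mean(wins: int, losses: int) -> float:
    # Beta(1 + wins, 1 + losses) posterior mean under a uniform Beta(1, 1) prior
    return (wins + 1) / (wins + losses + 2)

print(posterior_mean(0, 0))  # 0.5  -> every machine starts as a 50/50 guess
print(posterior_mean(3, 1))  # 0.666..., matching the 4/6 example above
```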
-
-
-
-
-
-
-
| Stat | Initial value | Meaning |
|---|---|---|
| Round | 1 / 6 | Tournament just started |
| Pulls Left | 12 | Use all 12 before the next round unlocks |
| Time Left | 5:00 | When the timer hits 0, the machines lock |
| Machines Tested | 0 | How many different machines you have tried this round |
| Round Reward | 0 | How many wins you got in this round |
| Tournament Reward | 0 | How many wins you have across all rounds so far |
| Skill Score | 0.0 | This is the overall leaderboard score |
-
-
-
-
-
-
What To Do Now
-
- Start by sampling several machines. The wide uncertainty bands show that your prior is still broad.
-
-
-
-
-
-
-
- Submission Code
-
MAB-XXXX-XXXX
-
This identifies your tournament run. It gets copied together with your final score.
-
-
- Current Best Guess
-
No evidence yet
-
Early guesses are weak. Once uncertainty narrows, this becomes more meaningful.
-
-
-
-
-
-
-
Casino Floor
-
- Click a machine to pull it. The big number is your current posterior mean. The blue band is a rough plausible range. Wide band means "you still do not know much yet."
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Press Start Round to begin the 5:00 timer. Submit unlocks only after Round 6.
-
-
-
-
-
Round Report (This Round Only)
-
-
Finish a round to reveal the true machine probabilities, compare your posterior beliefs to reality, and see whether your exploration paid off.
-
-
-
-
-
Bayes Coach
-
-
-
-
-
-
-
-
Leaderboard
-
- Students submit only after the tournament is complete. The form should record the final Skill Score, the raw reward, and the submission code from the copied line.
-
-
- Suggested columns for the reset leaderboard: student name, skill score, raw reward, submission code, and optionally section or timestamp.
-
diff --git a/docs/demos/8_kmeans_clustering.html b/docs/demos/8_kmeans_clustering.html
deleted file mode 100644
--- a/docs/demos/8_kmeans_clustering.html
+++ /dev/null
- The elbow method helps determine the optimal number of clusters (K) by analyzing how the
- Sum of Squared Errors (SSE) changes with different K values.
-
-
- Key Points:
-
-
-
As K increases, SSE naturally decreases
-
The "elbow" appears where adding more clusters gives diminishing returns
-
This point often indicates a good balance between model complexity and fit
-
-
- Try it: Generate new data to see how the elbow curve changes. Look for
- where the curve starts to level off; this often suggests a reasonable choice for K.
-
- Assignment Step: For each point \(\,x_i\,\), calculate \(\,d(x_i, \mu_j)^2\,\) for all \(\,j\,\) and assign \(\,x_i\,\) to the cluster with the smallest distance.
-
-
- Update Step: For each cluster \(\,j\,\), recalculate \(\,\mu_j\,\) as the mean of all points assigned to it.
-
-
- SSE Calculation: Calculate the new SSE after the update.
-
-
- Convergence Check: If SSE hasn't significantly decreased or max iterations reached, stop. Otherwise, go back to step 1.
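A minimal numpy sketch of the four steps above (it assumes no cluster ever ends up empty during the updates):

```python
import numpy as np

def kmeans(X, k, max_iters=100, seed=0):
    rng = np.random.default_rng(seed)
    centroids = X[rng.choice(len(X), size=k, replace=False)]  # start at k random points
    for _ in range(max_iters):
        # Assignment step: nearest centroid by squared Euclidean distance
        d2 = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
        labels = d2.argmin(axis=1)
        # Update step: each centroid becomes the mean of its assigned points
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centroids, centroids):             # convergence check
            break
        centroids = new_centroids
    sse = ((X - centroids[labels]) ** 2).sum()                # SSE for this k (used by the elbow plot)
    return labels, centroids, sse
```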
-
-
-
Mathematical Foundations
-
A. Dataset Representation
-
- Let \(\,X = \{x_1, x_2, ..., x_n\}\) be our dataset, where each \(\,x_i\,\) is a \(\,d\)-dimensional vector.
-
-
B. Squared Euclidean Distance
-
- The distance between two points \(\,x = (x_1, ..., x_m)\) and \(\,y = (y_1, ..., y_m)\) is calculated as:
-
- $$d(x, y)^2 = \sum_{k=1}^{m} (x_k - y_k)^2$$
-
Isn’t the mean affected by outliers? How is it the best prediction?
A prediction is only the “best” relative to some loss function. When using the constant model, \(H(x) = h\), the mean is the best prediction only if we choose to use the squared loss function, \(L_\text{sq}(y_i, h) = (y_i - h)^2\). If we choose another loss function, like absolute loss \(L_\text{abs}(y_i, h) = \lvert y_i - h \rvert\), the mean is no longer the best prediction.
The key idea is that different loss functions lead to different “best” parameters.
“Empirical risk” is another term for “average loss for whatever loss function you’re using.” Any loss function \(L(y_i, h)\) can be used to create an empirical risk function \(R(h)\). We’ve seen two common loss function choices:
When using absolute loss, \(L_\text{abs}(y_i, h) = \lvert y_i - h\rvert\), the empirical risk, \(R_\text{abs}(h) = \frac{1}{n} \sum_{i = 1}^n \lvert y_i - h\rvert\), has a special name: “mean absolute error.”
When using squared loss, \(L_\text{sq}(y_i, h) = (y_i - h)^2\), the empirical risk, \(R_\text{sq}(h) = \frac{1}{n} \sum_{i = 1}^n (y_i - h)^2\), has a special name: “mean squared error.”
Let’s suppose we’re working with the constant model, \(H(x) = h\).
The minimizer of mean squared error is unique, because the minimizer of mean squared error for the constant model is the mean, and the mean of a collection of numbers \(y_1, y_2, ..., y_n\) is always just a single number. Specifically, it’s the number \(\frac{y_1 + y_2 + ... + y_n}{n}\).
The minimizer of mean absolute error is not necessarily unique. It’s unique when there’s an odd number of data points – specifically, if the data points are sorted in order, with \(y_1\) being the smallest and \(y_n\) being the largest, then the minimizer of mean absolute error is the median, \(y_{\frac{n+1}{2}}\). But if there are an even number of data points, then any of the infinitely many numbers on the number line between \(y_{\frac{n}{2}}\) and \(y_{\frac{n}{2} + 1}\) minimize mean absolute error, so the minimizer of mean absolute error is not necessarily unique.
For example, in the dataset 72, 90, 61, 85, 92, 75, there are an infinite number of possible predictions that minimize mean absolute error. 75 is one of them, but so is 75.001, 76, 79.913, etc – anything between 75 and 85, inclusive, minimizes mean absolute error.
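You can check this directly. A minimal numpy sketch:

```python
import numpy as np

y = np.array([72, 90, 61, 85, 92, 75])
mae = lambda h: np.abs(y - h).mean()

print(mae(75), mae(76), mae(79.913), mae(85))  # identical: any h in [75, 85] is optimal
print(mae(74.9), mae(85.1))                    # strictly worse just outside the interval
```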
What was the point of plugging \(h^*\) into \(R(h)\)?
We spent the first week of class minimizing empirical risk, \(R(h)\). We found that, depending on our choice of loss function, \(h^*\) ended up being a different measure of the center of our dataset. The point was to show that the values of \(R(h)\) actually have some meaning as well, and in particular, the smallest possible value of \(R(h)\) (which is \(R(h^*)\)) happens to describe the spread of our dataset.
In the image above, \(h^*\) is the \(x\)-coordinate of the vertex (80 and 85). We know what 80 and 85 mean – they’re the mean and median of the dataset 72, 90, 61, 85, 92, respectively. What we were trying to give context to is what 150 and 9.9 mean – they’re the variance and the mean absolute deviation from the median of our dataset. Both the variance and mean absolute deviation from the median are measurements of spread.
Are there more loss functions outside of what we learned in class?
There are plenty! For example, there’s Huber loss, which is like a smoothed version of absolute loss (it’s absolute loss, with the corner at the bottom replaced with the bottom of a parabola). There’s also cross-entropy loss, also known as “log loss”, which is designed for models that predict probabilities (like logistic regression). These, and many more, will come up in future ML classes, like DSC 140A and CSE 158/DSC 148.
Lecture(s) to Review:
N/A
How do I know which loss function to choose in practice?
As we’ve seen, different loss functions have different properties. At least with regard to the constant model, squared loss is sensitive to outliers and yields the mean as the optimal prediction, while absolute loss is robust to outliers and yields the median.
In practice, various models have a “default” choice of loss function. Regression usually uses squared loss, not just because squared loss is easily differentiable, but also because squared loss comes with lots of nice theoretical properties (which you’ll learn about in DSC 140A, like the fact that it implicitly assumes that the distribution of errors is normal/Gaussian). But depending on your model, you can just try different loss functions and see which ends up creating the model with the best performance!
Lecture(s) to Review:
N/A
What was the point of the midrange and infinity loss? Will I actually use that in practice?
I’ve never heard of anyone using \(\lvert y_i - h\rvert^p\) with \(p \rightarrow \infty\) as a loss function in practice, so no. But the point of us studying that was for us to get a better understanding of how different loss functions penalize different kinds of errors, and in particular, how the optimal constant prediction is influenced by outliers.
Again, for the constant model \(H(x) = h\):
Absolute loss, \(\lvert y_i - h\rvert\), isn’t sensitive to outliers; it’s very robust. Remember, the minimizer (the median) was found by finding the \(h\) where the number of points to the left of \(h\) equals the number of points to the right of \(h\).
Squared loss, \((y_i - h)^2\), is more sensitive to outliers. Remember, the minimizer (the mean) was found by finding the \(h\) where \(-\frac{2}{n} \sum_{i = 1}^n (y_i - h)= 0\), because \(-\frac{2}{n} \sum_{i = 1}^n (y_i - h)\) is the derivative of \(R_\text{sq}(h) = \frac{1}{n} \sum_{i = 1}^n (y_i - h)^2\). Since this is the case, the mean is “pulled” in the direction of the outliers, since it needs to balance the deviations.
Following the pattern, \(\lvert y_i - h\rvert^3\) would be even more sensitive to outliers.
As we keep increasing the exponent, \(\lvert y_i - h\rvert^p\) creates a prediction that’s extremely sensitive to outliers, to the point where its goal is to balance the worst case (maximum distance) from any one point. That’s where the midrange comes in – it’s in the middle of the data, so it’s not too far from any one point.
So while no, you won’t really use the idea of “infinity loss” in practice, I hope that by deeply understanding how it works, you’ll better understand how loss functions (including those we haven’t seen in class, but do exist in the real world) work and impact your predictions.
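A minimal numerical sketch of that progression on the dataset 72, 90, 61, 85, 92, using a grid search over candidate predictions:

```python
import numpy as np

y = np.array([72.0, 90.0, 61.0, 85.0, 92.0])
h = np.linspace(55, 100, 9001)

for p in [1, 2, 8, 64]:
    risk = (np.abs(y[None, :] - h[:, None]) ** p).mean(axis=1)
    print(f"p = {p:2d}: minimizer ~ {h[risk.argmin()]:.2f}")

print("median   =", np.median(y))             # p = 1
print("mean     =", y.mean())                 # p = 2
print("midrange =", (y.min() + y.max()) / 2)  # limit as p -> infinity
```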
In Lecture 4, is the \(x_i\) not part of the summation since it is out of the parentheses?
The question was referring to a summation like this one:
\[\sum_{i = 1}^n (y_i - w_0 - w_1 x_i) x_i\]
Here, \(x_i\) is indeed a part of the summation. The sum is of \(n\) terms, each of which is of the form \((y_i - w_0 - w_1 x_i) \cdot x_i\). That is, the summation above is equivalent to:

\[(y_1 - w_0 - w_1 x_1) x_1 + (y_2 - w_0 - w_1 x_2) x_2 + \cdots + (y_n - w_0 - w_1 x_n) x_n\]
On the left, we have the graph of the mean squared error of a constant prediction, \(h\), on the dataset 72, 90, 61, 85, 92. It shows us that there is some best \(h\), which we’ve been calling \(h^*\), that makes the mean squared error as small as possible. We showed, using calculus, that the value of \(h^*\) for any dataset is \(\text{Mean}(y_1, y_2, ..., y_n)\).
On the right, we have the graph of mean squared error of the line \(H(x) = w_0 + w_1 x\). The dataset is the dataset of departure times and commute times we’ve been using as our running example. Specifically:
The two axes on the “ground” of the plot represent different intercepts, \(w_0\), and slopes, \(w_1\), that we could be using for making predictions.
The height of the graph above any \((w_0, w_1)\) pair is \(\frac{1}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1 x_i))^2\). \(x_i\) represents the \(i\)th departure time (e.g. 8.5, corresponding to 8:30AM) and \(y_i\) represents the \(i\)th actual commute time (e.g. 75 minutes).
The point was to show what the function \(R_\text{sq}(w_0, w_1) = \frac{1}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1 x_i))^2\) actually looks like, before we went to use calculus to minimize it. It kind of looks like a bowl, and has a clearly defined minimum. Calculus helped us find that minimum, which occurs at \(w_0^* = \bar{y} - w_1^* \bar{x}\) and \(w_1^* = \frac{\sum_{i = 1}^n (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i = 1}^n (x_i - \bar{x})^2}\).
Unlike with squared loss, there is no closed-form solution for the line that minimizes mean absolute error. So, we have to use the computer to approximate the answer. Regression with squared loss is called “least squares regression,” but regression with absolute loss is called “least absolute deviations regression.” You can learn more here!
Lecture(s) to Review:
N/A
Is there a more detailed version of the MSE proof shown in Lecture 5?
Yes. Here’s a proof of the fact that \(R_\text{sq}(w_0^*, w_1^*) = \sigma_y^2 (1 - r^2)\).
First, note that since \(\sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})^2\), we have that \(\sum_{i = 1}^n (x_i - \bar{x})^2 = n \sigma_x^2\). Then, substituting \(w_0^* = \bar{y} - w_1^* \bar{x}\) and \(w_1^* = r \frac{\sigma_y}{\sigma_x}\), and using \(\frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y}) = r \sigma_x \sigma_y\):

\[\begin{aligned} R_\text{sq}(w_0^*, w_1^*) &= \frac{1}{n} \sum_{i=1}^n \left(y_i - (w_0^* + w_1^* x_i)\right)^2 \\ &= \frac{1}{n} \sum_{i=1}^n \left((y_i - \bar{y}) - w_1^* (x_i - \bar{x})\right)^2 \\ &= \sigma_y^2 - 2 w_1^* \cdot r \sigma_x \sigma_y + (w_1^*)^2 \sigma_x^2 \\ &= \sigma_y^2 - 2 r^2 \sigma_y^2 + r^2 \sigma_y^2 \\ &= \sigma_y^2 (1 - r^2). \end{aligned}\]
Suppose we want to multiply two matrices, \(A\) and \(B\). Let’s call \(C\) the product matrix between the two.
As we discussed in lecture, every entry of this resulting matrix \(C\) will be the result of the dot product of a row of \(A\) with a column of \(B\). For example, one entry of the product matrix \(C\) is formed by dotting \(\begin{bmatrix} a_{11} & a_{12} & a_{13}\end{bmatrix}\) with \(\begin{bmatrix}b_{11} \\ b_{21} \\ b_{31} \end{bmatrix}\).
This dot product is only possible if the “length” of each row in \(A\) is equal to the “height” of each column in \(B\). In our example, this dot product is defined by
Clearly, if the number of entries in the first row of \(A\) were not equal to the number of entries in the first column of \(B\), this dot product would not make sense. For example, say \(B\) only had \(2\) rows. Then, when computing the entries of our product \(C\), we would be dotting a length-3 row of \(A\) with a length-2 column of \(B\), and we could not compute the entry of \(C\), making our matrix multiplication impossible.
In essence, the multiplication of matrices is possible when the inner dimensions of \(A\) and \(B\) (columns and rows, respectively) match. If they do not, the dot products between the rows of \(A\) and the columns of \(B\) are not defined, and we cannot create a product matrix \(C\).
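A minimal numpy illustration of the inner-dimension rule:

```python
import numpy as np

A = np.ones((2, 3))    # 2×3
B = np.ones((3, 4))    # 3×4: inner dimensions (3 and 3) match
print((A @ B).shape)   # (2, 4)

B_bad = np.ones((2, 4))
try:
    A @ B_bad          # inner dimensions (3 and 2) do not match
except ValueError as e:
    print(e)
```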
What’s the relationship between spans, projections, and multiple linear regression?
Spans
The span of a set of vectors \(\{\vec{x}_1, \vec{x}_2, \ldots, \vec{x}_d\}\) is the set of all possible linear combinations of these vectors. In other words, the span defines a subspace in \(\mathbb{R}^n\) that contains all possible combinations of the independent variables.
In the context of multiple linear regression, the span of the feature vectors represents all possible values that can be predicted using a linear combination of the feature vectors.
Projections
A projection of the observation vector \(\vec{y}\) onto the span of the feature vectors \(\{\vec{x}_1, \vec{x}_2, \ldots, \vec{x}_d\}\) is any vector \(\vec{h}\) that lies in this span.
The distance between the observations and the projection of \(\vec{y}\) into the span of the feature vectors represents the error of a prediction. That is, each projection of \(\vec{y}\) into the span of the feature vectors is defined by scaling each of the feature vectors by a certain amount (\(w_1\), \(w_2\), etc.) and summing them; the distance from this linear combination of the feature vectors to the actual observed values of \(\vec{y}\) is the error of a certain prediction.
This error is written as
\(\vec{e} = \vec{y} - X\vec{w}\),
where \(X\) represents the design matrix made up of the feature vectors, and \(\vec{w}\) represents the coefficients that you are scaling the feature vectors by to obtain some projection of \(\vec{y}\) into the span of \(X\).
The orthogonal projection of \(\vec{y}\) into \(X\) is the one that minimizes the error vector (Or the distance between the predicted values of \(\vec{y}\) and the actual values of \(\vec{y}\)).
Multiple Linear Regression
Tying this all together, one can frame multiple linear regression as a projection problem: given some set of feature vectors \(\vec{x}_1, \vec{x}_2, ... , \vec{x}_d\), and an observation vector \(\vec{y}\), what are the scalars \(w_1, w_2, ... , w_d\) that give a vector in the span of the feature vectors that is the closest to \(\vec{y}\)?
In other words, how close can we get to the observed values of \(\vec{y}\), while in the span of our feature vectors?
This framing of multiple linear regression also leads us to the normal equations, \(X^T X \vec{w}^* = X^T \vec{y}\).
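A minimal numpy sketch of this projection view (synthetic data; the design matrix and weights are made up for illustration):

```python
import numpy as np

rng = np.random.default_rng(0)
X = np.column_stack([np.ones(50), rng.normal(size=(50, 2))])   # intercept column + two features
y = X @ np.array([2.0, -1.0, 0.5]) + rng.normal(scale=0.1, size=50)

w = np.linalg.solve(X.T @ X, X.T @ y)   # solve the normal equations X^T X w = X^T y
h = X @ w                               # orthogonal projection of y onto the span of X's columns
print(w)
print(np.allclose(X.T @ (y - h), 0))    # error vector is orthogonal to every feature column: True
```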
Why does the design matrix have a column of all 1s?
In linear regression, the design matrix \(X\) represents the features \(x_1, x_2, \ldots, x_d\). Each row of \(X\) corresponds to one data point, and each column corresponds to one feature. The parameter vector \(\vec{w}\), which we multiply by \(X\) to obtain our predictions \(\vec{h}\), contains the weights for each feature, including the intercept or bias term \(w_0\).
The term \(w_0\) is a constant that helps adjust the linear regression model vertically. This term is universal between predictions. In other words, regardless of the values of the other features \(x_1, x_2, \ldots, x_d\), the value of \(w_0\) will be the same. Let’s explore how this relates to our design matrix.
When the design matrix \(X\) is multiplied by the parameter vector \(\mathbf{w}\), each row of \(X\) produces a prediction \(h\) depending on the values of the features in the row. Each value in the row is multiplied by its associated weight in the parameter vector, and the resulting products are summed to form a prediction. However, we want the weight associated with \(w_0\) to output the same constant bias term no matter the values in \(X\).
To ensure this, we include a column of 1s at the beginning of the design matrix \(X\). This column represents the constant contribution of the bias term \(w_0\), and will always be multiplied by \(w_0\) when a particular observation is being used to make a prediction. In other words, regardless of the values of the features in \(X\), every prediction will have \(w_0 \cdot 1\) added to it. Let’s give a quick example of this.
Suppose we have a linear regression problem with two features. The design matrix \(X\) is:

\[X = \begin{bmatrix} 1 & x_{11} & x_{12} \\ 1 & x_{21} & x_{22} \\ 1 & x_{31} & x_{32} \end{bmatrix}, \qquad \vec{w} = \begin{bmatrix} w_0 \\ w_1 \\ w_2 \end{bmatrix}, \qquad X\vec{w} = \begin{bmatrix} w_0 + w_1 x_{11} + w_2 x_{12} \\ w_0 + w_1 x_{21} + w_2 x_{22} \\ w_0 + w_1 x_{31} + w_2 x_{32} \end{bmatrix}\]
As you can see in this example, our predictions all included the constant bias term \(w_0\), because in forming our predictions, \(w_0\) was always scaled by \(1\), the first entry in each row of our design matrix. This setup ensures that the intercept is included in the model, and does not interfere with the relationship between the other features and the prediction.
What is the projection of \(\vec{y}\) onto \(\text{span}(\vec{x})\) – is it \(w^*\) or \(w^* \vec{x}\)?
In multiple linear regression, the orthogonal projection of the vector \(\vec{y}\) onto the span of the vectors \(\{\vec{x}^{(1)}, \vec{x}^{(2)}, ..., \vec{x}^{(d)}\}\) is expressed as:
\[\vec{h}^* = X\vec{w}^*.\]
Here, \(\vec{w}^*\) is a vector of scalar coefficients (\(w_1, w_2\), etc.), and \(X\) is the design matrix. In other words, \(\vec{w}^*\) provides the specific coefficients with which to form a linear combinations of your features to make predictions \(\vec{h}^*\).
So, to answer the question directly: \(w^* \vec{x}\) is the projection of \(\vec{y}\) onto \(\text{span}\{\vec{x}^{(1)}, \vec{x}^{(2)}, ..., \vec{x}^{(d)}\}\), and \(w^*\) is the set of scalars used to make this projection when multiplied with \(\vec{x}\).
Do the normal equations work even when there is only one column in the matrix \(X\)?
Yes! Let’s look at two different cases where this can occur.
Case 1: \(X\) is a column of ones
If \(X\) is a column of ones, the model \(H(\vec{x}) = w_0\) fits a constant line through the data. Using the normal equations,
\[\vec{1}^T \vec{1} w_0^* = \vec{1}^T \vec{y}.\]
\(\vec{1}^T \vec{1} = n\), where \(n\) is the number of data points, and \(\vec{1}^T \vec{y} = \sum_{i=1}^n y_i\). Thus, the normal equations become:
\[n \cdot w_0^* = \sum_{i=1}^n y_i.\]
And, solving for \(w_0^*\), we get
\[w_0^* = \frac{1}{n} \sum_{i=1}^n y_i,\]
which is the mean of the target values.
Case 2: \(X\) has different values
Now, let’s imagine that \(X\) is a single column vector \(\vec{x}\) with different values for each data point, representing a single feature. The normal equations become \(\vec{x}^T \vec{x} \, w^* = \vec{x}^T \vec{y}\), so

\[w^* = \frac{\vec{x}^T \vec{y}}{\vec{x}^T \vec{x}} = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2},\]

which is the slope of the best-fitting line through the origin.
When do two vectors in \(\mathbb{R}^2\) span all of \(\mathbb{R}^2\)? When do \(n\) vectors in \(\mathbb{R}^n\) span all of \(\mathbb{R}^n\)?
Two vectors in \(\mathbb{R}^2\) span all of \(\mathbb{R}^2\) when they are linearly independent (You cannot express one as a scalar multiple of the other). In other words, if \(\vec{u}\) and \(\vec{v}\) are two vectors in \(\mathbb{R}^2\), they will span all of \(\mathbb{R}^2\) if \(\vec{u}\) and \(\vec{v}\) are not collinear, or on the same line.
Similarly, \(n\) vectors in \(\mathbb{R}^n\) span all of \(\mathbb{R}^n\) when they are linearly independent. This means that no vector in the set can be expressed as a linear combination of the others.
Intuition
To span a space means to cover it entirely.
Think of two vectors in \(\mathbb{R}^2\). If one vector is a scalar multiple of the other, then they both point in the same direction or opposite directions, essentially lying on the same line. This means they can only cover that line and cannot cover any other directions.
In higher dimensions, the same principle applies. For example, in \(\mathbb{R}^3\), three linearly independent vectors point in different directions and can cover all of three-dimensional space. However, if one is a linear combination of the others, then the three vectors lie on the same plane, and can only span that plane.
What’s the relationship between spans, projections, and multiple linear regression?
Spans
The span of a set of vectors \(\{\vec{x}_1, \vec{x}_2, \ldots, \vec{x}_d\}\) is the set of all possible linear combinations of these vectors. In other words, the span defines a subspace in \(\mathbb{R}^n\) that contains all possible combinations of the independent variables.
In the context of multiple linear regression, the span of the feature vectors represents all possible values that can be predicted using a linear combination of the feature vectors.
Projections
A projection of the observation vector \(\vec{y}\) onto the span of the feature vectors \(\{\vec{x}_1, \vec{x}_2, \ldots, \vec{x}_p\}\) is any vector \(\vec{h}\) that lies in this span.
The distance between the observations and the projection of \(\vec{y}\) into the span of the feature vectors represents the error of a prediction. That is, each projection of \(\vec{y}\) into the span of the feature vectors is defined by scaling each of the feature vectors by a certain amount (\(w_1\), \(w_2\), etc.) and summing them; the distance from this linear combination of the feature vectors to the actual observed values of \(\vec{y}\) is the error of a certain prediction.
This error is written as
\(\vec{e} = \vec{y} - X\vec{w}\),
where \(X\) represents the design matrix made up of the feature vectors, and \(\vec{w}\) represents the coefficients that you are scaling the feature vectors by to obtain some projection of \(\vec{y}\) into the span of \(X\).
The orthogonal projection of \(\vec{y}\) into the span of \(X\) is the one that minimizes the length of the error vector (or, equivalently, the distance between the predicted values and the actual values of \(\vec{y}\)).
Multiple Linear Regression
Tying this all together, one can frame multiple linear regression as a projection problem: given some set of feature vectors \(\vec{x}_1, \vec{x}_2, ..., \vec{x}_d\) and an observation vector \(\vec{y}\), what are the scalars \(w_1, w_2, ..., w_d\) that give the vector in the span of the feature vectors that is closest to \(\vec{y}\)?
In other words, how close can we get to the observed values of \(\vec{y}\), while in the span of our feature vectors?
This framing of multiple linear regression also leads us directly to the normal equations.
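Here is one possible NumPy sketch of that projection view, with a made-up design matrix (intercept column plus two features) and a made-up observation vector; it solves the normal equations and reports how far the resulting projection is from \(\vec{y}\):

```python
import numpy as np

# Made-up design matrix: intercept column plus two features.
X = np.array([[1.0, 2.0, 1.0],
              [1.0, 1.0, 3.0],
              [1.0, 4.0, 2.0],
              [1.0, 3.0, 5.0]])
y = np.array([3.0, 5.0, 7.0, 11.0])

# Solve the normal equations X^T X w = X^T y.
w_star = np.linalg.solve(X.T @ X, X.T @ y)

h_star = X @ w_star   # the projection of y onto the span of X's columns
e = y - h_star        # the error vector
print(w_star, np.linalg.norm(e))
```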
When \(X^TX\) isn’t invertible, how do we solve the normal equations?
When \(X^TX\) isn't invertible, we cannot solve the normal equations using traditional methods. That is, if we cannot invert \(X^TX\), we cannot compute \(\vec{w}^* = (X^\mathrm{T}X)^{-1}X^\mathrm{T}\vec{y}\).
Generally, this situation arises when one of the columns of our design matrix \(X\) is a linear combination of the other columns in \(X\). This leads to infinitely many solutions satisfying the normal equations, so finding a unique solution is impossible. However, if you are interested in other methods with which to solve the normal equations when \(X^TX\) is not invertible, feel free to explore them! As a starting point, try researching the Moore-Penrose pseudo-inverse and ridge regression as two other approaches to solving for an optimal parameter vector!
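To kick off that exploration, here is a rough sketch of both ideas in NumPy, on a deliberately rank-deficient design matrix (the third column is twice the second, and all values are made up):

```python
import numpy as np

X = np.array([[1.0, 1.0, 2.0],
              [1.0, 2.0, 4.0],
              [1.0, 3.0, 6.0],
              [1.0, 4.0, 8.0]])
y = np.array([2.0, 3.0, 5.0, 7.0])

# Moore-Penrose pseudo-inverse: picks the minimum-norm solution among
# the infinitely many vectors satisfying the normal equations.
w_pinv = np.linalg.pinv(X) @ y

# Ridge regression: adding lam * I makes X^T X + lam * I invertible.
lam = 0.1
w_ridge = np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)

print(w_pinv)
print(w_ridge)
```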
A matrix is full rank when its columns are linearly independent.
In linear regression, the design matrix \(X\) must be full rank for the normal equations to have a unique solution. If \(X\) is not full rank, it implies multicollinearity among the features, which leads to infinitely many solutions when solving for the optimal parameters \(\vec{w}^*\). For clarity:
Full Rank: If the design matrix \(X\) is full rank, then all of its columns are linearly independent. This allows the normal equations:
\[X^T X \vec{w}^* = X^T \vec{y}\]
to have a unique solution.
Not Full Rank: If \(X\) is not full rank, then some columns of \(X\) are linear combinations of other columns. This leads to multicollinearity, which results in infinitely many solutions for the normal equations.
In multiple linear regression, is \(\vec{h}^*\) orthogonal to \(\vec{y}\)?
\(\vec{h}^*\) is the optimal hypothesis vector; that is, \(\vec{h}^* = X\vec{w}^*\). This means that \(\vec{h}^*\) is the orthogonal projection of our observation vector \(\vec{y}\) into the span of our feature vectors \(\vec{x}^{(1)}, \vec{x}^{(2)}, ..., \vec{x}^{(d)}\). As such, \(\vec{h}^*\) is orthogonal to the error vector \(\vec{e} = \vec{y} - \vec{h}^*\). However, this relationship does not imply orthogonality with \(\vec{y}\), or with any other vector aside from the error vector \(\vec{e}\).
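You can verify both claims numerically; a minimal sketch with made-up data:

```python
import numpy as np

X = np.array([[1.0, 2.0],
              [1.0, 0.0],
              [1.0, 1.0],
              [1.0, 3.0]])
y = np.array([1.0, 2.0, 2.0, 4.0])

w_star = np.linalg.solve(X.T @ X, X.T @ y)
h_star = X @ w_star
e = y - h_star

print(h_star @ e)   # ~0: h* is orthogonal to the error vector
print(h_star @ y)   # generally nonzero: h* is not orthogonal to y
```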
Why does the multiple linear regression model with two features look like a plane?
When we perform multiple linear regression with two features, we take information from two independent variables and predict some value for our target variable. Let's think about how this relates to a plane, both algebraically and geometrically.
Algebraically, if our features are \(x_1\) and \(x_2\), our prediction function takes the form
\[H(\vec{x}) = w_0^* + w_1^*x_1 + w_2^*x_2,\]
which is the general formula for a plane.
Geometrically, multiple linear regression with two features is the same idea:
Each feature (\(x_1\) and \(x_2\)) corresponds to one axis, and our target variable (the variable we are trying to predict) is represented by the vertical axis. When we vary the values on the \(x_1\) and \(x_2\) axes, we are exploring the values of our prediction function as we vary \(2\) features; this traces out a \(2\)-dimensional surface.
If this question also concerns why these predictions form a plane instead of some other surface, perhaps with curves or bends, we can also briefly address that. In a linear regression model, the relationship between the input features and the target variable is linear. This means that the predicted value is a linear combination of the input features, with each feature having a fixed weight (or coefficient). Another way to say this is that in a linear model with \(2\) feature vectors, our predictions must be within the span of our feature vectors. In \(3\) dimensions, this span is a plane (this concept is addressed in lectures 5 and 6, if you want a refresher on span!).
If we had a nonlinear prediction function with \(2\) features, we could see a prediction function that forms a curved surface in \(3\) dimensions. However, as a consequence of performing linear regression, our prediction function will form a plane.
For more visual intuition of this idea, check out the first 35 seconds of this video!
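And if you'd rather draw the picture yourself, here is a short matplotlib sketch of a two-feature linear model as a plane; the coefficients are arbitrary stand-ins, not fitted values:

```python
import numpy as np
import matplotlib.pyplot as plt

w0, w1, w2 = 1.0, 2.0, -0.5                 # made-up coefficients
x1, x2 = np.meshgrid(np.linspace(0, 5, 20), np.linspace(0, 5, 20))
H = w0 + w1 * x1 + w2 * x2                  # predictions over the grid

ax = plt.figure().add_subplot(projection="3d")
ax.plot_surface(x1, x2, H, alpha=0.5)
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.set_zlabel("H(x)")
plt.show()
```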
Welcome to our theorem-thumping playlist—perfect for tackling 40a material and getting in the mode to do some math. Every track has been carefully derived, ensuring your productivity converges. Rest easy! Each beat is rigorously optimized by our in-house team of audiophile-maticians, so your mind can flow freely from gradient descent to your next big combinatoric breakthrough. Put on your headphones, hit play, and let these tunes transform your study sessions into a full-rank masterpiece. In this playlist, there are no null vectors—only real solutions that keep you in perfect rhythm. Enjoy, and keep integrating those good vibes!
Class Kickoff
A curated playlist from the musical mind of Jiesen! Welcome to DSC40a.
Study Session Jams
7 hours long, just enough to get you through a full work day… minus meetings.
Tunes for Deadlines!
60min Math Crunch
diff --git a/docs/pages/staff.html b/docs/pages/staff.html
index e5c5b36..d8b5160 100644
--- a/docs/pages/staff.html
+++ b/docs/pages/staff.html
@@ -1 +1,192 @@
This hands-on, practical course is intended to get you experience working on data science projects. You may have theoretical knowledge from other courses, but here we are going to implement. Doing something is rarely simple. You will likely attempt to do something, do it wrong, learn from your mistakes, and with a bit of luck and skill, eventually succeed. That’s just part of the scientific process, and data science is no exception. This course is all about the practice of data science.
In focusing on the practice, there is theory that won’t be discussed and mathematical proofs that won’t be done. That is by design. In particular:
There are entire courses dedicated to each of the topics we’ll cover. To have time to do anything, we can’t teach all the details in a single course.
Experts in each of these domains are out there and excited to teach you the nitty gritty about each topic. TAKE THOSE OPPORTUNITIES… the theory we are skipping is important and useful!
We’re promoting data literacy. We believe that everyone who is data literate is at an advantage as they go out into the modern world. Data literacy is not limited to those who are computational gurus or math prodigies. You do not have to be either of those to excel at this course.
In this course, you will try many methods. You’ll even be asked to implement a technique that has not been explicitly taught. Again, this is by design. As a data scientist, you’ll regularly be asked to step outside of your comfort zone and into something new. Our goal is to get you as comfortable as possible in that space now. We want to provide you with a technical and a data science mindset that will allow you to ask the right questions for the problem at hand and set off alarm bells when something in your dataset or analysis is “off.”
Formulate a plan for and complete a data science project from start (question) to finish (communication)
Explain and carry out descriptive, exploratory, inferential, and predictive analyses in Python
Communicate results concisely and effectively in reports and presentations
Identify and explain how to approach an unfamiliar data science task
COURSE MATERIALS
There is no textbook
Python (>= 3.9) and a recent version of Jupyter Notebooks. You may fulfill this with any of the following:
Datahub
A local copy on your own computer (we recommend the Anaconda distribution)
Cloud services such as Google Colab, Microsoft Azure, etc. These all have a free tier or free hours before billing.
git and a GitHub login
If you prefer a GUI feel free to use GitHub Desktop, SourceTree, VS Code, etc.
All other course materials will be provided on GitHub and datahub
Optional Readings:
There are no required readings for this course. But this course challenges you to learn on your own. If you’re not sure where to start, the following texts may be very helpful in the course.
Frankel, A (2020) Data Science In Practice. Lecture notes for DSC 80, which has substantial overlap with this course
Vanderplas, J (2023, 2nd ed) Python Data Science Handbook. Short and to the point. Both the text and the code are freely available on Github. Learn to use standard libraries to get things done.
Grus J (2019, 2nd ed) Data Science from Scratch. This book takes you into HOWs and WHYs, rather than just teaching you to use a library you don't really understand. This is the harder book, but you will grow tremendously working through it. Can be accessed for free through your UCSD login
Final exam: This course has no final exam. Do not show up on the date/time of the final exam. Instead, this course has a final project.
All grades will be released on Canvas. It is your responsibility to check that your assignment was submitted, that your grade is accurate, and to get in touch if any are missing and/or you think there is a problem.
Extra credit worth up to 2% of the final grade will be awarded for
Exceptional performance on the final report: meeting the “Excellent” category on more than 5 rubric items will get 0.5% bonus to the final grade.
Exceptional participation on EdStem discussion board: Roughly the top 20 contributors will get 0.25% bonus to their final grade. You’re more likely to make the cut if you help organize things, participate in good discussions and help answer questions, and less likely to make the cut if you just ask a lot of random questions.
Answering the pre & post course surveys will give 0.5% extra credit (0.25% each for 0.5% total)
Attending guest lecture(s) in-person is 0.25% extra credit
Filling out all 7 of the weekly project progress surveys (0.25% of grade, see Project section below)
If >2/3 of the students fill out SET teaching evaluations at the end of the quarter there will be an extra 0.5% of the final grade for everyone. BTW, this EC criterion has not been met since 2022; why won't the current generation of students actually fill out evals?
Our grading scale is

| letter | range | letter | range |
|--------|-------|--------|-------|
| A+ | ≥ 97% | C+ | < 80% to 77% |
| A | < 97% to 94% | C | < 77% to 74% |
| A- | < 94% to 90% | C- | < 74% to 70% |
| B+ | < 90% to 87% | D | < 70% to 60% |
| B | < 87% to 84% | F | < 60% |
| B- | < 84% to 80% | | |
Know that a third of the class typically feels overwhelmed at the start of the quarter. That said, the average is quite high in this course (typically A). So, while we anticipate you all doing well in this course, if you are feeling lost or overwhelmed, that’s ok! Should that occur, we recommend: (1) asking questions in class, (2) attending office hours and/or (3) asking for help on EdStem.
Attendance
I continue to believe that attending in-person is the best way to learn for most people. Our goal is to make the lecture and discussion section worth your while to attend through interactive content and making it a good place to get questions answered. If you can’t come to a particular class session or two that’s fine, but if you’re planning to never come I think you’re making a mistake.
In-person illness policy
Please do not attend any in-person activity (lecture/section/office hours) if you are feeling ill, especially if you are sneezing/coughing and have a fever. If you feel mildly ill but without sneezing/coughing, or if you have bad allergies, then you may come to in-person events while wearing a well-fitting mask.
LECTURE
This quarter there are two sections of lecture. For the first week please attend the section you were assigned. After that there will probably be enough free seats for you to attend the section you prefer, and I’m ok with that. I will podcast one of the two sections.
Please bring a smartphone, tablet, or laptop to lecture for occasional live polling via Google Form. Up to 1% of your grade will be given for participating in at least 20 out of the 30 or more such exercises that will be offered. Most lectures have 1 or 2 such exercises, but some lectures may not have any.
DO NOT COME TO LECTURE IF YOU ARE ILL! Taking some time off won't impact your attendance credit, since you can miss 1/3 of the in-class exercises without loss of points.
DISCUSSION SECTION
Discussion sections will
review the previous week’s material
administer a 20 min quiz on the previous week’s material
provide help for the week’s discussion lab, assignments, or any topic
In Week 1, discussion section is a Python review; there will be no quiz or any turn-in item that week. For subsequent weeks, sections will follow the bullet points above, and there will be both a quiz and help with turn-in items for the week.
If there are free seats you may attend any section you prefer, but if the seats are full and you are not registered for that section please leave so someone who is supposed to be there can participate.
| Section | Time and Location | TA |
|---------|-------------------|----|
| A01 | W 5:00p-5:50p, GH 242 | TBD |
| A02 | W 11:00a-11:50a, DIB 121 | Jessica |
| A03 | W 12:00p-12:50p, DIB 121 | Jessica |
| A04 | W 1:00p-1:50p, COA B17 | Shreya |
| A05 | W 2:00p-2:50p, COA B17 | Shreya |
| A06 | W 3:00p-3:50p, COA B26 | Jessica |
| A07 | W 4:00p-4:50p, COA B26 | Shreya |
| A08 | W 6:00p-6:50p, CSB 5 | Chaitanya |
| B01 | F 9:00a-9:50a, DIB 121 | TBD |
| B02 | F 10:00a-10:50a, COA B17 | Zhicheng |
| B03 | F 11:00a-11:50a, COA B26 | Zhicheng |
| B04 | F 12:00p-12:50p, COA B17 | Zhicheng |
| B05 | F 1:00p-1:50p, PCYNH 120 | TBD |
| B06 | F 4:00p-4:50p, CSB 5 | Chaitanya |
| B07 | F 3:00p-3:50p, CSB 5 | TBD |
| B08 | F 5:00p-5:50p, CSB 5 | Chaitanya |
Discussion section lab exercises
Each week there will be a short lab exercise to review material from lecture and give hands-on programming experience. You can work on these exercises in your lab sections, and your section leaders will be happy to help guide you towards the correct answers. Or, if you don't need help, you can complete them on your own. You may seek help from your classmates or chat about problems in a group, but please realize that this is your time to practice and learn new skills with low stakes. If someone else does it for you, it's hard to practice and learn!
Please see the course policy on AI assistance and follow it when working on lab exercises.
Lab exercises are available to you on Datahub, and also turned in there. Lab exercises will be due on Fridays 11:59PM and released one week before they are due. If lab exercises are due on a holiday then they will be accepted until the following non-holiday day that week.
ASSIGNMENTS
If lab exercises are low stakes and you can get lots of help, then assignments are high stakes and you can only get minimal help. They will be completed individually in Jupyter Notebooks, which are released and submitted on datahub. Each assignment will cover the material in the previous two lab exercises.
Please see the course policy on AI assistance and follow it when working on assignments.
The practice of data science involves writing code to answer questions and accomplish tasks. Thus, to get practice, your assignments will require you to use Python to do just that. Not everything will be explicitly mapped out step-by-step for you. This is intentional. Figuring things out when it’s not entirely clear what to do next is part of the practice here. You’ll attempt things that won’t work and become comfortable with this. You’ll get stuck and work to get unstuck. Not quite knowing exactly what’s going on at all times is part of the process. And, to be honest, part of the job of being a data scientist.
As assignments become more difficult, we don't want you to feel totally lost. If you've thought long and hard, gone down a long rabbit hole on Stack Overflow, and can't even get a sense of what the next step may be, take a step away. Take a break. Then, come back and see if you can't solve it with a refreshed mind. If you're still totally stuck, ask on EdStem, talk to a classmate, and/or attend office hours for help.
But how should you help one another on assignments? You should: ask a question that leads the student to figure it out for themselves, point out the correct principle/theory that applies in this case, provide a link to an explanation, or a chunk of pseudo-code. You should not: provide the full answer or code they can just copy/paste.
Datahub: Assignments and labs
Assignments and labs will be submitted individually on datahub. They will be released a week before the assignment due date.
It is very easy to mess up datahub submission because the UI is trash. YOU are SOLELY RESPONSIBLE for using it correctly. You will not get points back if you mess things up:
forget to save your work and submit work with missing answers… you will lose those points
forget to follow instructions for exercises on GitHub or fail to follow those instructions completely to the letter… you will lose those points
forget to press the "submit" button after you are done… it's a zero
somehow corrupt a file, delete a file, etc… it's a zero
press the wrong submit button (e.g., submit A3 a second time instead of submitting A4)… it's a zero for A4
pressing the wrong submit button could end up getting you a late penalty… losing you 25% of your A3 grade in the example above
we do not have the history of your submissions; all we can see is your last submission, so we can't go back and grade an earlier one
so if a second submission comes in after the deadline but before grading, it will be counted as late
There will be at least a dozen people each quarter who say "I'm so sorry, there was a mistake when I clicked submit! Can I please get it graded without late penalty?" And my answer will be no. Do not be that person!
You will receive individualized feedback via email with your grade and feedback about a week after each assignment is due.
Late policy on assignments and labs
Assignments and labs will be accepted up to 5 days late with a 25% late penalty taken off your grade. Every student receives 7 free late days with NO DEDUCTION for use on any assignment or lab exercise. Without penalty, you can turn in 7 work items one day late each, or 1 work item 3 days late and another 4 days late, or any other combination.
PLEASE NOTE: while your pool of late days is 7, your maximum lateness on any single item is 5 days. PLEASE NOTE: these late days are intended for use when you are behind and need an extra day: illness, family emergencies, mental health crises, etc. You will not be given more late days. You will not be given more than 5 days to get something submitted. If you cannot manage to turn in assignments within this system, you need to talk with your dean, because something has gone drastically wrong that cannot be handled in my class.
Regrades on assignments and labs
We will work hard to grade everyone fairly and return assignments quickly. But, we know you also work hard and want you to receive the grade you’ve earned. Occasionally, grading mistakes do happen, and it’s important to us to correct them.
If you think there is a mistake in your grade, request a regrade within 72 hours of your receipt of the grade on EdStem, using the “Regrade requests” folder. This message should include evidence of why you think your answer was correct (i.e. a specific reference to something said in lecture) and should point to the specific part of the assignment in question.
Note that points will not be awarded if you fail to follow instructions. For example, if the instructions say to name the variable orange and you name it ornage (misspelled), you will not be awarded credit upon regrade. This is because (1) following instructions and being detail-oriented is important and (2) there are hundreds of students taking the course this quarter. It would be an unfair burden to place on TAs if we didn't have this policy.
COURSE PROJECT
Your course project will be completed in a group of 4-5 people. The reality of data science is that you will have to work with others. You'll need to work together to communicate effectively, manage time, organize your projects, and accomplish a goal. People will have different knowledge and skill sets. It is your job as a group to work together to figure out how to maximize each group member's skills, so that your differences are helpful to accomplishing your goal rather than a hindrance. For example, some of you will find the programming aspects of the class assignments very easy, while others will struggle. Alternatively, some of you may find experimental research and hypothesis testing intuitive, while others find it confusing and frustrating. It is best for your project if you choose a team with a mix of backgrounds and experience.
Finding A Group
Groups can be found in a few different ways:
If you have people in the class you know you want to work with, chat with one another and if you’re all on board, form a group.
There will be time to find groups in discussion section.
If you don’t know people in the class or don’t have people you want to work with, no problem. Seek them out on EdStem!
You will submit who your group is via Google Form by the Wednesday of week 3 (see Course Schedule). One form will be submitted per group.
If you do not sign up for a group you will be randomly assigned one. However, you generally don't want that; you'll have more fun if you join up with people who want to work on the same thing as you. Also, in my experience the randomly assigned groups have more trouble than the ones students put together on their own.
Probably the most important way to start a team off on the right foot is to discuss expectations of how you will work together. How will you divide up the work? How often and where will you meet? How will you communicate with each other? What’s the maximum amount of time it should take for someone to respond to a message? How will you double check that things that need to happen did happen? If there is a problem meeting a deadline how should the person responsible let others know, and then how should the rest of the team react? There are many possible answers to these questions, and there are many more questions about expectations that you might want to ask yourselves. Your team should decide what’s right for you, and then write down these expectations in a place that the team will often see it. Trust me, this will help things go more smoothly.
Project Components
These project components are completed and submitted as a group and are described in the Project documentation: https://github.com/COGS108/Projects. This includes: 1) Previous Project Review 2) Project Proposal 3) Project Checkpoint #1: Data, 4) Project Checkpoint #2: EDA, 5) Final Project Report, and 6) Final Project Video
There are also components of the project that must be completed individually. Starting week 3, there will be an optional weekly survey to be completed individually describing your project progress. Students who complete all 7 weeks' surveys will earn 0.5% extra credit to their final grade. At the end of the course there is a team evaluation survey… this is your opportunity to let us know if a teammate did not contribute.
COURSE SCHEDULE

| Date | Week | Day | Lecture topic | Due today | Section covers |
|------|------|-----|---------------|-----------|----------------|
| Mar-30 | 1 | M | Welcome! | | |
| Apr-01 | 1 | W | Version Control I | | python review (nothing to turn in) |
| Apr-03 | 1 | F | Version Control II | | python review (nothing to turn in) |
| Apr-06 | 2 | M | Data & Intuition I | | |
| Apr-08 | 2 | W | Data & Intuition II | Practice assignment, pre-course survey | Q1 and D1 (git and conda) |
| Apr-10 | 2 | F | Data Wrangling | D1 | Q1 and D1 (git and conda) |
| Apr-13 | 3 | M | Ethics | | |
| Apr-15 | 3 | W | Data Science questions | GitHub ID, Group signup | Q2 and D2 (data) |
| Apr-17 | 3 | F | Data viz principles | D2 | Q2 and D2 (data) |
| Apr-20 | 4 | M | Exploratory Data Analysis | | |
| Apr-22 | 4 | W | Geospatial | A1, Project Review* | Q3 and D3 (viz / geospatial) |
| Apr-24 | 4 | F | Inference I | D3 | Q3 and D3 (viz / geospatial) |
| Apr-27 | 5 | M | Inference II | | |
| Apr-29 | 5 | W | Non-parametric inference | Project Proposal* | Q4 and D4 (inference) |
| May-01 | 5 | F | Linear models I | D4 | Q4 and D4 (inference) |
| May-04 | 6 | M | Linear models II | | |
| May-06 | 6 | W | Dimensionality Reduction | A2 | Q5 and D5 (linear models) |
| May-08 | 6 | F | Machine Learning I | D5 | Q5 and D5 (linear models) |
| May-11 | 7 | M | Machine Learning II | | |
| May-13 | 7 | W | Machine Learning III | Checkpoint #1: Data* | Q6 and D6 (DR / ML) |
| May-15 | 7 | F | Machine Learning IV | D6 | Q6 and D6 (DR / ML) |
| May-18 | 8 | M | NLP I | | |
| May-20 | 8 | W | NLP II | A3 | Q7 and D7 (ML / NLP) |
| May-22 | 8 | F | How to be wrong I | D7 | Q7 and D7 (ML / NLP) |
| May-25 | 9 | M | NO LECTURE - Memorial Day | | |
| May-27 | 9 | W | How to be wrong II | Checkpoint #2: EDA* | Q8 and D8 (how to be wrong) |
| May-29 | 9 | F | How to be wrong III | D8 | Q8 and D8 (how to be wrong) |
| Jun-01 | 10 | M | How to be wrong IV | | |
| Jun-03 | 10 | W | Guest lecture / catch up | A4 | Project help - no lab or quiz |
| Jun-05 | 10 | F | Your future | | Project help - no lab or quiz |
| Jun-08 | Finals | M | | | |
| Jun-10 | Finals | W | | Final project*, video*, team eval survey, post-course survey | |
| Jun-12 | Finals | F | | | |
* indicates group submission. All other assignments/quizzes/surveys are completed & submitted individually.
This class will be a welcoming, inclusive, and harassment-free experience for everyone, regardless of gender, gender identity and expression, age, sexual orientation, disability, physical appearance, body size, race, ethnicity, religion (or lack thereof), political beliefs/leanings, or technology choices.
At all times, you should be considerate and respectful. Always refrain from demeaning, discriminatory, or harassing behavior and speech. Last of all, take care of each other.
If you have a concern, please speak with anyone on the instruction team (professor, TAs, or IAs). If you are uncomfortable doing so, that’s ok! The OPHD (Office for the Prevention of Sexual Harassment and Discrimination) and CARE (confidential advocacy and education office for sexual violence and gender-based violence) are wonderful resources on campus.
You will work together on projects. You should help one another learn in general. Assignments and discussion labs should be completed individually, although you may seek help from your fellow students. However, you may not give answers to each other at any time. In discussion labs it's OK to be much more leading and provide more of the answer. In assignments you should be more careful about how much detail you are providing in your help to others.
Examples of good collaboration on assignments:
Student posts non-working code to get help, others send a link to a good reference page in sklearn documentation, or point out the generic kind of mistake being made (e.g., you’ve messed up the order of operations). Nobody just writes the correct code for the student.
Student posts a question about a theory or concept. If it's not directly related to an assignment question you can choose to answer in full. However, it's generally more helpful for learning if you use the Socratic method: ask the student questions that lead them to find the answer themselves. Doing this also helps you cement your own knowledge of the subject!
Student posts a question about an assignment problem. Others point out important principles that we have learned in class that can be used to solve it. They describe the important points, or mention important pitfalls to avoid. References to book pages, lecture slides, or lecture video times are helpful. Nobody posts the correct answer for the student.
For group projects, you will work together but every person in the group is expected to understand every aspect of the project. People may be asked to individually explain any aspect of the project and your grade may be reduced compared to the rest of the group if you are unable to do so. Projects may include ideas and code from other sources, but these other sources must be documented with clear attribution.
Cheating and plagiarism have been and will be strongly penalized.
Policy on generative AI
AI tools may be used freely in this course provided their use is transparent and properly cited. All AI contributions must be credited just as any other source would be cited. Students are fully responsible for the accuracy, originality, and quality of all submitted work. This expectation is in place to ensure that all submitted work reflects the student’s own learning and skills. Suspected submission of AI-generated content without citation or presenting it as entirely one’s own work will be reported to the Academic Integrity Office in accordance with university policy.
AI should be used as a tutor and teammate. It should not be used to do your work for you. More assistance from AI is acceptable on projects, less on labs, and only minimal, conceptual help should be used on assignments.
When used for writing, it works best if you talk to it about ideas and then pick your own words, use it again to edit your draft, and finally read the result and make the final edits yourself.
When used for programming as a beginner, talk to AI about ideas for solving the problem, and ask it to explain concepts, syntax, and functions. Then write the code yourself. Ask AI to help you understand errors, but you need to struggle with solving them yourself in order to learn. Ask AI to generate practice problems for you to solve.
When used for programming as an expert, you write a design document of how to solve the problem and allow AI to work on parts of the design document. Have AI generate unit tests, and validate them by hand. Always read the code AI generated and edit it as necessary. Ask AI to critique the implementation and explain any errors.
Disability Access
Students requesting accommodations due to a disability must provide a current Authorization for Accommodation (AFA) letter. These letters are issued by the Office for Students with Disabilities (OSD), which is located in University Center 202 behind Center Hall. Please make arrangements to contact the professor privately to arrange accommodations. If you are struggling to get a meeting with OSD, you can let the professor know and they may be able to help accommodate while you work to get official documentation. Contacting the OSD can help you further: 858.534.4382 (phone), osd@ucsd.edu (email), http://disabilities.ucsd.edu
Difficult life situations
Sometimes life outside of academia can be difficult. Please email me or come to office hours if stuff outside the classroom prevents you from doing well inside it. I can often refer you on to the help you need.
If you don’t have the most essential resources required to thrive as a student, please contact UCSD Basic Needs who can help you access nutritious food and stable housing, and help you seek the means to reach financial wellness.
If you need emergency food, finances, and/or academic and social support you can also contact UCSD Mutual Aid. They provide mentoring and aid that comes from volunteers among your peers. If you don’t need that kind of support, consider joining them in helping your fellow classmates who do.
If you need counseling or if you are in a mental crisis you can contact CAPS. They provide psychiatric services, workshops, and counseling; they also operate a 24/7 crisis hotline at 858.534.3755
Letters of recommendation
TL;DR If you want a letter I advise you to work as an IA or RA or some other capacity with more than one professor.
If you are bound for graduate school you will probably need letters from 2 or 3 people. Many students ask professors they have taken a class from, but I want you to know this is a bad idea. You really want to get a letter from someone you have worked with more closely, who knows your talents and personality from experience working together with you over a long time. You are one of hundreds of students I will have this quarter. TBH with relatively few exceptions I will not know you well. To get a solid letter from a prof, you want to be a research assistant, independent study, honors thesis advisee, or instructional assistant. If you are interested in grad school you need to have done the work before the beginning of senior year; letters are needed in Nov/Dec of your senior year for the following Fall admission.
If you are just one of the hundreds of students in a class, then most profs (including me) probably won’t write you a letter. If they do, at best you will get a letter that is roughly “STUDENT was in my CLASS where they got an A+”. Every graduate school admission committee in the world can read between the lines and see that this is not a good recommendation. Any kind of selective program will probably take a hard pass given a letter like that.
If you are an assistant (research/instructional/etc.) or an honors advisee then your mentor will definitely write you a letter. And that letter will go much, much harder, going into great detail about your qualities. And that kind of detailed, informative letter is what graduate admissions committees want to see.
What if you don’t have the right history to get a detailed letter from a professor? I think that a detailed letter from another source is probably better for admissions than a minimum letter from a prof. Ask your manager from when you did an internship related to your chosen field, the head of a volunteer organization you work with, a grad student you worked for on a research project, etc. Someone that has been to grad school is best, but even someone without grad experience who knows you well is a potentially useful letter.
WARNING: I have had to scale back my minimum letter writing for students in my class. You can ask, I will probably say no. Because of other time commitments I will only write minimum letters for a handful of students per quarter who were in the top 10% of the grade distribution. And because this class is so easy, don’t assume just because you have an A+ that you are in that top 10%.
What should you call me?
Most students call me Professor Fleischer or Dr. Fleischer. My last name is pronounced “fly-sure”. My pronouns are he/him.
If you’re into old skool slam dunks you can call me Dr. J. I’m also perfectly happy if you call me Jason, but not all professors are OK with that kind of informality. Please do not address me as Mr. Fleischer; if you’re going to use an honorific please use the one that people expect in the situation.
What I should call you
I should call you by your preferred name, with the correct pronunciation and any honorific or pronouns you choose. Please correct me if I ever make a mistake.
How to Get Your Question(s) Answered and/or Provide Feedback
It’s great that we have so many ways to communicate, but it can get tricky to figure out who to contact or where your question belongs or when to expect a response. These guidelines are to help you get your question answered as quickly as possible and to ensure that we’re able to get to everyone’s questions.
That said, to ensure that we’re respecting their time, TAs and IAs have been instructed they’re only obligated to answer questions between normal working hours (M-F 9am-5pm). However, I know that’s not when you may be doing your work. So, please feel free to post whenever is best for you while knowing that if you post late at night or on a weekend, you may not get a response until the next day. As such, do your best not to wait until the last minute to ask a question.
If you have…
Questions about course content: these are awesome! We want everyone to see them and have their questions answered too… so post these to the EdStem discussion board!
A technical assignment question: Come to office hours, because answering technical questions is often best accomplished in person where we can discuss the question and talk through ideas. However, if that is not possible, post your question to the discussion board. Be as specific as you can in the question you ask. And, for those answering, help your classmates as much as you can without just giving the answer. Help guide them, point them in a direction, provide pseudo code, but do not provide code that answers assignment questions.
Been stuck on something for a while (>30min) and aren't even really sure where to start: Programming can be frustrating and it may not always be obvious what is going wrong or why something isn't working. That's ok - we've all been there! If you are stuck, you can and should reach out for help, even if you aren't exactly sure what your specific question is. To determine when to reach out, consider the 2-hour rule. This rule states that if you are stuck, work on that problem for an hour. Then, take a 30 minute break and do something else. When you come back after your break, try for another 30 minutes or so to solve your problem. If you are still completely stuck, stop and contact us (office hours, post on EdStem). If you don't have a specific question, include the information you have (what you're stuck on, the code you've been trying that hasn't been working, and/or the error messages you've been getting).
Questions about course logistics: First, check the syllabus. If the answer is not there, ask a classmate. If you are still unsure, post on EdStem.
Questions about a grade: Post a note to instructors on EdStem and select the ‘regrade requests’ tag. Include specifics as to why you feel you mistakenly/unfairly lost points in that post.
Something super cool to share related to class: feel free to post on EdStem (‘social’ tag), email the prof, or come to office hours. Be sure to include COGS108 in the email subject line and your full name in your message.
Something you want to talk about in-depth: meet in person during office hours or schedule a time to meet by email. Be sure to include COGS 108 in the email subject line.
Some feedback about the course you want to share anonymously: If you've been offended by an example in class, really liked or disliked a lesson, wish something had been covered in class that wasn't, etc., but would rather not share this publicly, please fill out the anonymous Google Form.
Acknowledgements
Thank you to Kyle Shannon for allowing us to use his course website as a template for this course website.
COURSE STAFF

| Role | Name | Section | Office Hours | Contact |
|------|------|---------|--------------|---------|
| Instructor | Jason Fleischer | — | Signup link | Email |
\ No newline at end of file
diff --git a/docs/resources/grokking-bayes.html b/docs/resources/grokking-bayes.html
deleted file mode 100644
index 1566fb4..0000000
--- a/docs/resources/grokking-bayes.html
+++ /dev/null
@@ -1,511 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
Grokking Bayes' Theorem
-
-
This guide walks through a practical example that illustrates Bayes' theorem using a college major scenario. Let's explore how our intuition can sometimes lead us astray and how Bayes' theorem helps us reason correctly about probabilities.
-
-
The Problem Statement
-
-
Consider the following description of a person named Ahmed:
-
-
-
Ahmed is outgoing and confident.
-
-
-
-
Question 1: Based solely on this description, which seems more likely?
-
-
Ahmed is a communications major
-
Ahmed is a STEM major
-
-
Take a moment to think about your intuitive answer before proceeding.
-
-
-
Examining Our Intuition
-
-
Many people intuitively choose "communications major" because the description seems to match stereotypical traits we associate with students in communications programs. This is using what psychologists call the representativeness heuristic - judging probability by how well something matches our mental prototype.
-
-
-
Question 2: What critical information are we missing if we make this judgment from the description alone?
-
-
-
- Click for answer
-
We're missing the base rates - how common communications majors and STEM majors are in the college population. This is crucial information for making an accurate probability assessment.
-
-
-
Adding Base Rates
-
-
In many colleges and universities, STEM majors significantly outnumber communications majors:
-
-
Approximately 15% of students are communications majors
-
Approximately 85% of students are STEM majors
-
-
-
This means STEM majors are roughly 5-6 times more common than communications majors in the overall student population.
-
-
-
Question 3: How should this information affect our probability estimate?
-
-
-
- Click for answer
-
This drastically changes the calculation. Even if the description matches communications majors better, we need to account for the fact that we're much more likely to randomly select a STEM major than a communications major from the student population.
-
-
-
Visualizing the Problem
-
-
Let's represent our student population as a grid where:
-
-
Communications majors are represented in orange (a smaller portion on the left)
-
STEM majors are represented in blue (the much larger remaining portion)
-
-
-
-
-
-
-
-
-
- [Figure: The Communications vs. STEM Major Problem. Top panel, "Base Rates in College Population": 30 communications majors (15%) and 170 STEM majors (85%). Middle panel, "Students Matching 'Outgoing and Confident' Description": 70% of communications majors (21 students) and 30% of STEM majors (51 students) match, giving P(Comm given description) = 21 / (21 + 51) ≈ 29.2%. Bottom panel, "Students Who Match the Description": 21 vs. 51. Even though a higher percentage of communications majors match the description (70% vs. 30%), there are still more STEM majors who match because STEM majors are much more common.]
-
-
-
-
-
-
Question 4: If we were to randomly select a student from this population, what is the probability they would be a communications major?
-
-
-
- Click for answer
-
About 15%. In our simplified visualization, we're showing 30 communications majors out of a total of 200 students (30 communications + 170 STEM).
-
-
-
Accounting for the Description
-
-
Now let's consider how well the description matches each major:
-
-
Suppose about 70% of communications majors are outgoing and confident (21 out of our 30 communications majors)
-
Suppose only about 30% of STEM majors are outgoing and confident (51 out of our 170 STEM majors)
-
-
-
We can visualize this by highlighting the portion of each group that matches the description, as shown in the middle section of our visualization.
-
-
-
Question 5: Even though a higher percentage of communications majors match the description, why might there still be more STEM majors who match it?
-
-
-
- Click for answer
-
Because STEM majors greatly outnumber communications majors in the total population. In this case, even though only 30% of STEM majors match the description compared to 70% of communications majors, the absolute number is still larger: 51 STEM majors vs. 21 communications majors.
-
-
-
Calculating with Bayes' Theorem
-
-
Let's use the numbers from our visualization:
-
-
We have 30 communications majors in our population
-
We have 170 STEM majors
-
70% of communications majors (21 people) match the description
-
30% of STEM majors (51 people) match the description
-
-
-
-
Question 6: Among students matching the description, how many are communications majors and how many are STEM majors?
-
-
-
- Click for answer
-
21 communications majors and 51 STEM majors match the description.
-
-
-
-
Question 7: What is the probability that someone matching the description is a communications major?
Click for answer
It's 21 / (21 + 51) = 21/72 ≈ 29.2%. Even though the description matches communications majors at a higher rate, the probability is still only about 29% that Ahmed is a communications major.
-
-
-
As shown in the visualization, even though the description is more representative of communications majors, there are still many more STEM majors who match it simply because STEM majors are so much more common.
-
-
Formalizing with Bayes' Theorem
-
-
Bayes' theorem provides the mathematical framework for this kind of reasoning:
-
-
-
$$P(A|B) = \frac{P(B|A) \times P(A)}{P(B)}$$
-
-
-
Where:
-
-
$$P(A|B)$$ is the probability of A given B has occurred (posterior)
-
$$P(B|A)$$ is the probability of B given A (likelihood)
$$P(A)$$ is the prior probability of A, before seeing any evidence
$$P(B)$$ is the total probability of B (the evidence)
In our example, conditioning on the description raises the probability that Ahmed is a communications major above the 15% base rate, but it is still more likely that Ahmed is a STEM major.
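A few lines of Python reproduce the whole calculation; the numbers are the base rates and match rates from the example above:

```python
# Priors and likelihoods from the example above.
p_comm, p_stem = 0.15, 0.85
p_desc_given_comm, p_desc_given_stem = 0.70, 0.30

# Total probability of matching the description (the evidence).
p_desc = p_desc_given_comm * p_comm + p_desc_given_stem * p_stem

# Bayes' theorem: P(Comm | description).
posterior = p_desc_given_comm * p_comm / p_desc
print(posterior)   # ~0.292, matching the 21 / (21 + 51) count-based answer
```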
-
-
-
The Heart of Bayes' Theorem
-
-
The fundamental insight of Bayesian reasoning is that we need to consider both:
-
-
How likely each hypothesis is to begin with (prior probabilities)
-
How well the evidence fits each hypothesis (likelihoods)
-
-
-
-
-
-
-
-
- [Figure: The Heart of Bayes' Theorem. Left panel: all possibilities. Middle panel: all possibilities fitting the description. Right panel: P(Comm given description), computed within that restricted space. Caption: if this line of reasoning, where seeing new evidence restricts the space of possibilities, makes sense to you, then congratulations! You understand the heart of Bayes' theorem.]
-
-
-
-
-
As shown in the visualization:
-
-
The left panel shows all possibilities with their original distribution
-
The middle panel shows how the evidence restricts the space of possibilities
-
The right panel shows how we calculate the final probability within that restricted space
-
-
-
-
Question 9: Why is it incorrect to only consider how well the description matches each major?
-
-
-
- Click for answer
-
Because we'd be ignoring the base rates - how common each major is in the student population. This leads to the base rate fallacy, where we overemphasize the matching characteristics and undervalue the prior probabilities.
-
-
-
Real-World Applications
-
-
Bayes' theorem is crucial in many real-world scenarios:
-
-
-
Question 10: How might this type of reasoning be relevant when interpreting personality test results?
-
-
-
- Click for answer
-
When a personality test suggests someone has traits often associated with a certain profession or personality type, we should consider not just how well the traits match the stereotype, but also how common that profession or personality type is in the general population.
-
-
-
-
Question 11: How could Bayesian reasoning help with making predictions about student success in different programs?
-
-
-
- Click for answer
-
When trying to predict which program a student might succeed in based on their traits, we should consider both how well their traits match successful students in each program AND how many students succeed in each program overall. A program with higher overall success rates might be a better bet even if another program seems to match their traits slightly better.
-
-
-
Conclusion
-
-
The communications/STEM major example highlights a common error in probabilistic reasoning - ignoring base rates. Bayes' theorem provides a formal framework for combining prior knowledge with new evidence to reach more accurate conclusions.
-
-
Remember:
-
-
Intuition often focuses on how well evidence matches our hypotheses
-
Proper Bayesian reasoning requires also considering how common each hypothesis is to begin with
-
When base rates are extreme, they can outweigh even strong evidence to the contrary
-
-
-
-
Final Question: Can you think of a situation in your own life where you might have fallen prey to the base rate fallacy? How could you apply Bayesian reasoning to avoid this error in the future?
In the table below, you can find lecture videos created by Janine Tiefenbruck, who created this course and taught it many times. The lecture videos linked below will generally be pretty similar in content coverage to our lectures, but there are indeed differences in notation and order. You are responsible for everything covered in OUR lectures, even if something doesn’t appear in the videos below. When in doubt, refer to the main lecture slides & videos posted in Home and ask questions on Piazza.
These are from a different course for a different audience, and use different notation and terminology. However, the high-level ideas are similar to those in the first few weeks of our course.
Unlike the first half of the course, where we had course notes written specifically for this class, we don’t have COGS 108-specific notes for the second half of the class, because there are many high-quality resources available online that cover the same material. Below, you’ll find links to some of these resources.
Readings and Sources of Practice Problems
Open Intro Statistics: Sections 2.1, 2.3, and 2.4 cover the probability we are learning in this course at a good level for undergraduates. This is a good substitute for a textbook, similar to the course notes that we had for the first part of the course. It goes through the definitions, terminology, probability rules, and how to use them. It’s succinct and highlights the most important things.
Probability for Data Science: Chapters 1 and 2 of this book have a lot of good examples demonstrating some standard problem-solving techniques. This book should be primarily useful for more problems to practice and learn from. This book is written at a good level for students in this class. It is used at UC Berkeley in their Probability for Data Science course. Our course only really covers material from the first two chapters, but if you want to extend your learning of probability as it applies to data science, this is a good book to help you do that.
Theory Meets Data: Chapters 1 and 2 of this book cover similar content to Chapters 1 and 2 of the Probability for Data Science book, but with different prose and examples. It is used at UC Berkeley for a more introductory Probability for Data Science course.
Grinstead and Snell’s Introduction to Probability: Chapters 1, 3, and 4.1 of this book cover the material from our class. This book is a lot longer and more detailed than the others, and it uses more formal mathematical notation. It should give you a very thorough understanding of probability and combinatorics, but it is a lot more detailed, so the more abbreviated resources above will likely be more useful. With that said, this book is written at a good level for undergraduates and is used in other undergraduate probability classes at UCSD, such as CSE 103.
Introduction to Mathematical Thinking: This course covers topics in discrete math, some of which are relevant to us (in particular, set theory and counting). In addition to the lecture videos linked on the homepage, you may want to look at the notes section.
Khan Academy: Counting, Permutations, and Combinations: Khan Academy has a good unit called Counting, Permutations, and Combinations that should be pretty helpful for the combinatorics we are learning in this class. A useful aspect of it is the practice questions that combine permutations and combinations. Most students find that the hardest part of these counting problems is knowing when to use permutations and when to use combinations. These practice questions have them mixed together, so you really get practice learning which is the right technique to apply to which situation.
\ No newline at end of file
diff --git a/pages/faqs.md b/pages/faqs.md
deleted file mode 100644
index 3c0b29a..0000000
--- a/pages/faqs.md
+++ /dev/null
@@ -1,658 +0,0 @@
----
-layout: page
-title: 🙋 FAQs
-description: Answers to frequently asked questions each week.
-nav_order: 7
----
-
-# 🙋FAQs (new)
-
-
-
-
-Jump to:
-
-- [Loss Functions & Empirical Risk](#loss-functions--empirical-risk)
-- [Simple Linear Regression](#simple-linear-regression)
-- [Linear Algebra](#linear-algebra)
-- [Multiple Linear Regression](#multiple-linear-regression)
-
----
-
-## Loss Functions & Empirical Risk
-
-### Isn't the mean affected by outliers? How is it the best prediction?
-
-A prediction is only the "best" relative to some loss function. When using the constant model, $$H(x) = h$$, the mean is the best prediction only if we choose to use the squared loss function, $$L_\text{sq}(y_i, h) = (y_i - h)^2$$. If we choose another loss function, like absolute loss $$L_\text{abs}(y_i, h) = \lvert y_i - h \rvert$$, the mean is no longer the best prediction.
-
-The key idea is that different loss functions lead to different "best" parameters.
-
-#### Lecture(s) to Review:
-
-- [Lecture 2](https://dsc40a.com/resources/lectures/lec02/lec02-filled.pdf)
-
-- [Lecture 3](https://dsc40a.com/resources/lectures/lec03/lec03-filled.pdf)
-
-### Does empirical risk = mean squared error?
-
-"Empirical risk" is another term for "average loss for whatever loss function you're using." Any loss function $$L(y_i, h)$$ can be used to create an empirical risk function $$R(h)$$. We've seen two common loss function choices:
-
- When using absolute loss, $$L_\text{abs}(y_i, h) = \lvert y_i - h\rvert$$, the empirical risk, $$R_\text{abs}(h) = \frac{1}{n} \sum_{i = 1}^n \lvert y_i - h\rvert$$, has a special name: "mean absolute error."
- When using squared loss, $$L_\text{sq}(y_i, h) = (y_i - h)^2$$, the empirical risk, $$R_\text{sq}(h) = \frac{1}{n} \sum_{i = 1}^n (y_i - h)^2$$, has a special name: "mean squared error."
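As a quick illustration, both empirical risks are one-liners in NumPy; this sketch uses the small example dataset that appears later in this FAQ:

```python
import numpy as np

y = np.array([72, 90, 61, 85, 92])
h = 80   # a constant prediction

mae = np.mean(np.abs(y - h))    # mean absolute error, R_abs(h)
mse = np.mean((y - h) ** 2)     # mean squared error, R_sq(h)
print(mae, mse)
```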
-
-#### Lecture(s) to Review:
-
-- [Lecture 2](https://dsc40a.com/resources/lectures/lec02/lec02-filled.pdf)
-
-### What does it mean for a minimizer to be unique?
-
-Let's suppose we're working with the constant model, $$H(x) = h$$.
-
-The minimizer of mean squared error is unique, because the minimizer of mean squared error for the constant model is the mean, and the mean of a collection of numbers $$y_1, y_2, ..., y_n$$ is always just a single number. Specifically, it's the number $$\frac{y_1 + y_2 + ... + y_n}{n}$$.
-
-The minimizer of mean absolute error is not necessarily unique. It's unique when there's an odd number of data points – specifically, if the data points are sorted in order, with $$y_1$$ being the smallest and $$y_n$$ being the largest, then the minimizer of mean absolute error is the median, $$y_{\frac{n+1}{2}}$$. But if there are an even number of data points, then any of the infinitely many numbers on the number line between $$y_{\frac{n}{2}}$$ and $$y_{\frac{n}{2} + 1}$$ minimize mean absolute error, so the minimizer of mean absolute error is not necessarily unique.
-
-For example, in the dataset 72, 90, 61, 85, 92, 75, there are an infinite number of possible predictions that minimize mean absolute error. 75 is one of them, but so is 75.001, 76, 79.913, etc – anything between 75 and 85, inclusive, minimizes mean absolute error.
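A quick brute-force check of this, trying several of the candidate predictions listed above:

```python
import numpy as np

y = np.array([72, 90, 61, 85, 92, 75])
for h in [75, 75.001, 76, 79.913, 85]:
    print(h, np.mean(np.abs(y - h)))   # every h prints the same MAE
```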
-
-
-
-
-
-
-
-#### Lecture(s) to Review:
-
-- [Lecture 2](https://dsc40a.com/resources/lectures/lec02/lec02-filled.pdf)
-
-### What was the point of plugging in $$h^*$$ into $$R(h)$$?
-
-We spent the first week of class minimizing empirical risk, $$R(h)$$. We found that, depending on our choice of loss function, $$h^*$$ ended up being a different measure of the center of our dataset. The point was to show that the values of $$R(h)$$ actually have some meaning as well, and in particular, the smallest possible value of $$R(h)$$ (which is $$R(h^*)$$) happens to describe the spread of our dataset.
-
-
-
-
-
-
-
-In the image above, $$h^*$$ is the $$x$$-coordinate of the vertex (80 and 85). We know what 80 and 85 mean – they're the mean and median of the dataset 72, 90, 61, 85, 92, respectively. What we were trying to give context to is what 150 and 9.9 mean – they're the variance and the mean absolute deviation from the median of our dataset. Both the variance and mean absolute deviation from the median are measurements of spread.
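If you want to recompute these spread measures yourself, here is a minimal sketch for the dataset above (the exact values may differ slightly from the numbers read off the plot):

```python
import numpy as np

y = np.array([72, 90, 61, 85, 92])

# R_sq evaluated at the mean is the variance of the dataset.
print(np.mean((y - y.mean()) ** 2))

# R_abs evaluated at the median is the mean absolute deviation
# from the median.
print(np.mean(np.abs(y - np.median(y))))
```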
-
-#### Lecture(s) to Review:
-
-- [Lecture 2](https://dsc40a.com/resources/lectures/lec02/lec02-filled.pdf)
-
-### Are there more loss functions outside of what we learned in class?
-
-There are plenty! For example, there's Huber loss, which is like a smoothed version of absolute loss (it's absolute loss, with the corner at the bottom replaced with the bottom of a parabola). There's also cross-entropy loss, also known as "log loss", which is designed for models that predict probabilities (like logistic regression). These, and many more, will come up in future ML classes, like DSC 140A and CSE 158/DSC 148.
-
-#### Lecture(s) to Review:
-
-- N/A
-
-### How do I know which loss function to choose in practice?
-
-As we've seen, different loss functions have different properties. At least with regards to the constant model:
-
-| Loss function | Minimizer of empirical risk (the optimal $$h^*$$) | Sensitivity to outliers |
-| --- | --- | --- |
-| Absolute loss, $$\lvert y_i - h \rvert$$ | median | robust (not very sensitive) |
-| Squared loss, $$(y_i - h)^2$$ | mean | sensitive |
-
-In practice, various models have a "default" choice of loss function. Regression usually uses squared loss, not just because squared loss is easily differentiable, but also because it comes with lots of nice theoretical properties, which you'll learn about in DSC 140A – for example, minimizing squared loss implicitly assumes that the distribution of errors is normal/Gaussian. But depending on your model, you can also just try different loss functions and see which one ends up creating the model with the best performance!
-
-#### Lecture(s) to Review:
-
-- N/A
-
-### What was the point of the midrange and infinity loss? Will I actually use that in practice?
-
-I've never heard of anyone using $$\lvert y_i - h\rvert^p$$ with $$p \rightarrow \infty$$ as a loss function in practice, so no. But the point of us studying that was for us to get a better understanding of how different loss functions penalize different kinds of errors, and in particular, how the optimal constant prediction is influenced by outliers.
-
-Again, for the constant model $$H(x) = h$$:
-
-- Absolute loss, $$\lvert y_i - h\rvert$$, isn't sensitive to outliers – it's very robust. Remember, the minimizer (the median) was found by finding the $$h$$ where (# points to the left of $$h$$) = (# points to the right of $$h$$).
-- Squared loss, $$(y_i - h)^2$$, is more sensitive to outliers. Remember, the minimizer (the mean) was found by finding the $$h$$ where $$-\frac{2}{n} \sum_{i = 1}^n (y_i - h)= 0$$, because $$-\frac{2}{n} \sum_{i = 1}^n (y_i - h)$$ is the derivative of $$R_\text{sq}(h) = \frac{1}{n} \sum_{i = 1}^n (y_i - h)^2$$. Since this is the case, the mean is "pulled" in the direction of the outliers, since it needs to balance the deviations.
-- Following the pattern, $$\lvert y_i - h\rvert^3$$ would be even more sensitive to outliers.
-
-As we keep increasing the exponent, $$\lvert y_i - h\rvert^p$$ creates a prediction that's extremely sensitive to outliers, to the point where its goal is to minimize the worst-case (maximum) distance from any one point. That's where the midrange comes in – it's halfway between the minimum and the maximum of the data, so it's never too far from any one point.
-
-So while no, you won't really use the idea of "infinity loss" in practice, I hope that by deeply understanding how it works, you'll better understand how loss functions (including those we haven't seen in class, but do exist in the real world) work and impact your predictions.
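-
-If you'd like to see this progression numerically, here's a small sketch (assuming NumPy) that grid-searches for the minimizer of $$\frac{1}{n} \sum_{i=1}^n \lvert y_i - h \rvert^p$$ as $$p$$ grows:
-
-```python
-import numpy as np
-
-y = np.array([72, 90, 61, 85, 92])
-hs = np.linspace(55, 100, 45001)  # a fine grid of candidate predictions h
-
-# as p grows, the minimizer moves from the median (85) past the mean (80)
-# toward the midrange, (61 + 92) / 2 = 76.5
-for p in [1, 2, 8, 32, 128]:
-    risks = np.mean(np.abs(y[:, None] - hs[None, :]) ** p, axis=0)
-    print(p, hs[np.argmin(risks)])
-```
-
-For $$p = 1$$ you should see the median, for $$p = 2$$ the mean, and for large $$p$$ something approaching the midrange.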
-
-#### Lecture(s) to Review:
-
-- [Lecture 3](https://dsc40a.com/resources/lectures/lec03/lec03-filled.pdf) (Slide 19)
-
----
-
-## Simple Linear Regression
-
-### In Lecture 4, is the $$x_i$$ not part of the summation since it is out of the parentheses?
-
-The question was referring to a summation like this one:
-
-$$\sum_{i = 1}^n (y_i - w_0 - w_1 x_i) x_i$$
-
-Here, $$x_i$$ is indeed a part of the summation. The sum is of $$n$$ terms, each of which is of the form $$(y_i - w_0 - w_1 x_i) \cdot x_i$$. That is, the summation above is equivalent to:
-
-$$\sum_{i = 1}^n \left( (y_i - w_0 - w_1 x_i) x_i \right)$$
-
-On the other hand, the following expression is invalid, since $$x_i$$ doesn't have any meaning when not part of a summation over $$i$$:
-
-$$\left( \sum_{i = 1}^n (y_i - w_0 - w_1 x_i) \right) x_i$$
-
-#### Lecture(s) to Review:
-
-- [Lecture 4](https://dsc40a.com/resources/lectures/lec04/lec04-filled.pdf)
-
-### What was the 3D graph in Lecture 4 about?
-
-*[Image: on the left, the graph of mean squared error for a constant prediction $$h$$; on the right, a 3D bowl-shaped surface showing the mean squared error of the line $$H(x) = w_0 + w_1 x$$ over pairs $$(w_0, w_1)$$.]*
-
-On the left, we have the graph of the mean squared error of a constant prediction, $$h$$, on the dataset 72, 90, 61, 85, 92. It shows us that there is some best $$h$$, which we've been calling $$h^*$$, that makes the mean squared error as small as possible. We showed, using calculus, that the value of $$h^*$$ for any dataset is $$\text{Mean}(y_1, y_2, ..., y_n)$$.
-
-On the right, we have the graph of mean squared error of the line $$H(x) = w_0 + w_1 x$$. Here, the dataset is the departure times and commute times we've been using as our running example. Specifically:
-
-The two axes on the "ground" of the plot represent different intercepts, $$w_0$$, and slopes, $$w_1$$, that we could be using for making predictions.
-
-The height of the graph above any $$(w_0, w_1)$$ pair is $$\frac{1}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1 x_i))^2$$. $$x_i$$ represents the $$i$$th departure time (e.g. 8.5, corresponding to 8:30AM) and $$y_i$$ represents the $$i$$th actual commute time (e.g. 75 minutes).
-
-The point was to show what the function $$R_\text{sq}(w_0, w_1) = \frac{1}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1 x_i))^2$$ actually looks like, before we went to use calculus to minimize it. It kind of looks like a bowl, and has a clearly defined minimum. Calculus helped us find that minimum, which occurs at $$w_0^* = \bar{y} - w_1^* \bar{x}$$ and $$w_1^* = \frac{\sum_{i = 1}^n (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i = 1}^n (x_i - \bar{x})^2}$$.
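-
-To tie the formulas back to code, here's a small sketch (assuming NumPy; the departure and commute values below are made up for illustration) that evaluates $$w_1^*$$ and $$w_0^*$$ directly and checks them against a library fit:
-
-```python
-import numpy as np
-
-# hypothetical departure times (hours after midnight) and commute times (minutes)
-x = np.array([7.0, 7.5, 8.0, 8.5, 9.0])
-y = np.array([92.0, 85.0, 79.0, 75.0, 68.0])
-
-w1 = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
-w0 = y.mean() - w1 * x.mean()
-
-# np.polyfit also minimizes mean squared error, so it should agree;
-# it returns the slope first, then the intercept
-print(w0, w1)
-print(np.polyfit(x, y, 1))
-```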
-
-#### Lecture(s) to Review:
-
-- [Lecture 4](https://dsc40a.com/resources/lectures/lec04/lec04-filled.pdf)
-
-### Can we minimize the mean absolute error of the simple linear regression model?
-
-Yes, we can! The issue is just that there doesn't exist a closed-form solution, i.e. a formula, for the optimal $$w_0^*$$ and $$w_1^*$$ in:
-
-$$R_\text{abs}(w_0, w_1) = \frac{1}{n} \sum_{i = 1}^n \lvert y_i - (w_0 + w_1 x_i) \rvert$$
-
-So, we have to use the computer to approximate the answer. Regression with squared loss is called "least squares regression," but regression with absolute loss is called "least absolute deviations regression." You can learn more [here](https://en.wikipedia.org/wiki/Least_absolute_deviations)!
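-
-Here's a rough sketch of what "using the computer" might look like (assuming NumPy and SciPy; the data, including the big outlier, is hypothetical):
-
-```python
-import numpy as np
-from scipy.optimize import minimize
-
-# hypothetical data with one big outlier in y
-x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
-y = np.array([2.1, 3.9, 6.2, 8.0, 30.0])
-
-def mean_absolute_error(w):
-    w0, w1 = w
-    return np.mean(np.abs(y - (w0 + w1 * x)))
-
-# no closed form exists, so we search numerically; Nelder-Mead copes with
-# the kinks that the absolute value creates
-result = minimize(mean_absolute_error, x0=[0.0, 1.0], method="Nelder-Mead")
-print(result.x)  # approximately the least absolute deviations line
-```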
-
-#### Lecture(s) to Review:
-
-- N/A
-
-### Is there a more detailed version of the MSE proof shown in Lecture 5?
-
-Yes. Here's a proof of the fact that $$R_\text{sq}(w_0^*, w_1^*) = \sigma_y^2 (1 - r^2)$$.
-
-First, note that since $$\sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})^2$$, we have that $$\sum_{i = 1}^n (x_i - \bar{x})^2 = n \sigma_x^2$$. Also, since $$w_0^* = \bar{y} - w_1^* \bar{x}$$, each error can be written as $$y_i - (w_0^* + w_1^* x_i) = y_i - \bar{y} - w_1^*(x_i - \bar{x})$$. Then:
-
-$$
-\begin{aligned}
-R_{\text{sq}}(w_0^*, w_1^*) &= \frac{1}{n} \sum_{i=1}^{n} \left( y_i - \bar{y} - w_1^*(x_i - \bar{x}) \right)^2 \\
-&= \frac{1}{n} \sum_{i=1}^{n} \left[ (y_i - \bar{y})^2 - 2 w_1^*(x_i - \bar{x})(y_i - \bar{y}) + (w_1^*)^2 (x_i - \bar{x})^2 \right] \\
-&= \frac{1}{n} \sum_{i=1}^{n} (y_i - \bar{y})^2 - \frac{2w_1^*}{n} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y}) + \frac{(w_1^*)^2}{n} \sum_{i=1}^{n} (x_i - \bar{x})^2 \\
-&= \sigma_y^2 - \frac{2w_1^*}{n} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y}) + (w_1^*)^2 \sigma_x^2 \\
-&= \sigma_y^2 - 2w_1^* \cdot \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^{n} (x_i - \bar{x})^2} \cdot \frac{\sum_{i=1}^{n} (x_i - \bar{x})^2}{n} + r^2 \sigma_y^2 \\
-&= \sigma_y^2 - 2 (w_1^*)^2 \sigma_x^2 + r^2 \sigma_y^2 \\
-&= \sigma_y^2 - 2 \left( r^2 \frac{\sigma_y^2}{\sigma_x^2} \right) \sigma_x^2 + r^2 \sigma_y^2 \\
-&= \sigma_y^2 - 2r^2 \sigma_y^2 + r^2 \sigma_y^2 \\
-&= \sigma_y^2 - r^2 \sigma_y^2 \\
-&= \sigma_y^2 (1 - r^2)
-\end{aligned}
-$$
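-
-If you'd like to verify the result numerically, here's a small sketch (assuming NumPy; the data is randomly generated for illustration):
-
-```python
-import numpy as np
-
-rng = np.random.default_rng(40)
-x = rng.normal(size=500)
-y = 3 * x + rng.normal(size=500)  # hypothetical roughly linear data
-
-r = np.corrcoef(x, y)[0, 1]
-w1 = r * y.std() / x.std()   # np.std defaults to the 1/n convention used here
-w0 = y.mean() - w1 * x.mean()
-
-mse = np.mean((y - (w0 + w1 * x)) ** 2)
-print(mse, y.var() * (1 - r ** 2))  # the two numbers should agree
-```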
-
-#### Lecture(s) to Review:
-
-- [Lecture 5](https://dsc40a.com/resources/lectures/lec05/lec05-filled.pdf) (Slide 25)
-
----
-
-## Linear Algebra
-
-### Can you recap the proof of the formula for $$w_1^*$$ that includes $$r$$?
-
-Sure!
-
-Let's start with the formulas for $$w_1^*$$ and $$r$$:
-
-$$
-w_1^* = \frac{\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^n (x_i - \bar{x})^2}
-$$
-
-$$
-r = \frac{1}{n} \sum_{i=1}^n \left(\frac{(x_i - \bar{x})}{\sigma_x}\right) \left(\frac{(y_i - \bar{y})}{\sigma_y}\right).
-$$
-
-Let's try to rewrite the formula for $$w_1^*$$ in terms of $$r$$.
-
-#### Step 1: Numerator
-
-First, we can simplify the formula for $$r$$ by factoring $$\sigma_x \sigma_y$$ out of the summation.
-
-$$
-r = \frac{1}{n \sigma_x \sigma_y} \sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y}).
-$$
-
-Rearranging this gives us the very pretty
-
-$$
-\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y}) = rn\sigma_x\sigma_y.
-$$
-
-Therefore, the numerator of the formula for $$w_1^*$$ can be expressed as $$rn\sigma_x\sigma_y$$.
-
-
-#### Step 2: Denominator
-
-Let's start with the formula for the standard deviation $$\sigma_x$$, defined as
-
-$$
-\sigma_x = \sqrt{\frac{1}{n}\sum_{i=1}^n (x_i - \bar{x})^2}.
-$$
-
-This means that the variance $$\sigma_x^2$$ is
-
-$$
-\sigma_x^2 = \frac{1}{n}\sum_{i=1}^n (x_i - \bar{x})^2.
-$$
-
-Multiplying through by $$n$$ gives
-
-$$
-n\sigma_x^2 = \sum_{i=1}^n (x_i - \bar{x})^2.
-$$
-
-Therefore, the denominator of the formula for $$w_1^*$$ can be expressed as $$n\sigma_x^2$$.
-
-#### Conclusion
-
-Putting these parts together, we can rewrite the original formula for $$w_1^*$$:
-
-$$
-w_1^* = \frac{rn\sigma_x\sigma_y}{n\sigma_x^2}.
-$$
-
-And, simplifying this expression, we arrive at a beautiful result:
-
-$$
-w_1^* = r \cdot \frac{\sigma_y}{\sigma_x}.
-$$
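-
-As a quick numerical sanity check of this equivalence, here's a sketch assuming NumPy, with randomly generated data:
-
-```python
-import numpy as np
-
-rng = np.random.default_rng(108)
-x = rng.normal(size=100)
-y = 2 * x + rng.normal(size=100)  # hypothetical data
-
-w1_original = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
-r = np.corrcoef(x, y)[0, 1]
-w1_via_r = r * y.std() / x.std()  # np.std uses the 1/n convention, matching sigma here
-
-print(w1_original, w1_via_r)  # the same number, as the derivation promises
-```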
-
-#### Lecture(s) to Review
-
-- [Lecture 5](https://dsc40a.com/resources/lectures/lec05/lec05-filled.pdf) (Slide 18)
-
-
-### What do you mean by "the inner dimensions need to match in order to perform matrix multiplication"?
-
-Think about the multiplication of two matrices:
-
-$$
-A = \begin{bmatrix} a_{11} & a_{12} & a_{13}\\ a_{21} & a_{22} & a_{23}\end{bmatrix} \text{ and }
-B = \begin{bmatrix} b_{11} & b_{12} \\ b_{21} & b_{22} \\ b_{31} & b_{32}\end{bmatrix}
-$$
-
-Let's call $$C$$ the product matrix between the two.
-
-As we discussed in lecture, every entry of the resulting matrix $$C$$ is the result of the dot product of a row of $$A$$ with a column of $$B$$. For example, the top-left entry of the product matrix $$C$$ is formed by dotting $$\begin{bmatrix} a_{11} & a_{12} & a_{13}\end{bmatrix}$$ with $$\begin{bmatrix}b_{11} \\ b_{21} \\ b_{31} \end{bmatrix}$$.
-
-This dot product is only possible if the "length" of each row in $$A$$ is equal to the "height" of each column in $$B$$. In our example, this dot product is
-
-$$
-c_{11} = (a_{11} \cdot b_{11}) + (a_{12} \cdot b_{21}) + (a_{13} \cdot b_{31})
-$$
-
-Clearly, if the number of entries in the first row of $$A$$ were not equal to the number of entries in the first column of $$B$$, this dot product would not make sense. For example, say $$B$$ only had $$2$$ rows. Then, when computing the entries of our product $$C$$, we would run into a situation like this:
-
-$$
-(a_{11} \cdot b_{11}) + (a_{12} \cdot b_{21}) + (a_{13} \cdot \text{...?})
-$$
-
-And we could not compute the entry of $$C$$, making our matrix multiplication impossible.
-
-In essence, matrix multiplication is only possible when the inner dimensions of $$A$$ and $$B$$ – the number of columns of $$A$$ and the number of rows of $$B$$, respectively – match. If they do not, the dot products between the rows of $$A$$ and the columns of $$B$$ are not defined, and we cannot create a product matrix $$C$$.
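-
-In NumPy, this rule shows up as a shape check. Here's a small sketch (the matrices are arbitrary placeholders):
-
-```python
-import numpy as np
-
-A = np.arange(6).reshape(2, 3)  # a 2 x 3 matrix, like the A above
-B = np.arange(6).reshape(3, 2)  # a 3 x 2 matrix, like the B above
-
-print((A @ B).shape)  # inner dimensions (3 and 3) match, so the product is 2 x 2
-
-C = np.arange(4).reshape(2, 2)
-try:
-    A @ C  # inner dimensions (3 and 2) do not match
-except ValueError as err:
-    print(err)
-```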
-
-#### Lecture(s) to Review:
-
-- [Lecture 6](https://dsc40a.com/resources/lectures/lec06/lec06-filled.pdf) (Slide 17)
-
-- [Lecture 7](https://dsc40a.com/resources/lectures/lec07/lec07-filled.pdf) (Slide 20)
-
-### What's the relationship between spans, projections, and multiple linear regression?
-
-#### Spans
-
-The **span** of a set of vectors $$\{\vec{x}_1, \vec{x}_2, \ldots, \vec{x}_d\}$$ is the set of all possible linear combinations of these vectors. In other words, the span defines a subspace in $$\mathbb{R}^n$$ that contains all possible combinations of the independent variables.
-
-$$
-\text{Span}\{\vec{x}_1, \vec{x}_2, \ldots, \vec{x}_d\} = \{w_1 \vec{x}_1 + w_2 \vec{x}_2 + \ldots + w_d \vec{x}_d : w_1, w_2, \ldots, w_d \in \mathbb{R}\}.
-$$
-
-In the context of multiple linear regression, the span of the feature vectors represents all possible values that can be predicted using a linear combination of the feature vectors.
-
-#### Projections
-
-A **projection** of the observation vector $$\vec{y}$$ onto the span of the feature vectors $$\{\vec{x}_1, \vec{x}_2, \ldots, \vec{x}_d\}$$ is any vector $$\vec{h}$$ that lies in this span.
-
-The distance between the observations and a projection of $$\vec{y}$$ onto the span of the feature vectors represents the error of a prediction. That is, each projection of $$\vec{y}$$ onto the span of the feature vectors is defined by scaling each of the feature vectors by a certain amount ($$w_1$$, $$w_2$$, etc.) and summing them; the distance from this linear combination of the feature vectors to the actual observed vector $$\vec{y}$$ is the error of that prediction.
-
-This error is written as
-
-$$
-\vec{e} = \vec{y} - X\vec{w}
-$$
-
-where $$X$$ represents the design matrix made up of the feature vectors, and $$\vec{w}$$ represents the coefficients that you are scaling the feature vectors by to obtain some projection of $$\vec{y}$$ into the span of $$X$$.
-
-The **orthogonal projection** of $$\vec{y}$$ onto the span of the columns of $$X$$ is the one that minimizes the length of the error vector (that is, the distance between the predicted values and the actual values of $$\vec{y}$$).
-
-#### Multiple Linear Regression
-
-Tying this all together, one can frame multiple linear regression as a projection problem: given some set of feature vectors $$\vec{x}_1, \vec{x}_2, ... , \vec{x}_d$$ and an observation vector $$\vec{y}$$, what are the scalars $$w_1, w_2, ... , w_d$$ that give a vector in the span of the feature vectors that is closest to $$\vec{y}$$?
-
-In other words, how close can we get to the observed values of $$\vec{y}$$, while in the span of our feature vectors?
-
-This framing of multiple linear regression also leads us to the **normal equations**, $$X^\mathrm{T}X\vec{w}^* = X^\mathrm{T}\vec{y}$$, whose solution (when $$X^\mathrm{T}X$$ is invertible) is
-
-$$
-\vec{w}^* = (X^\mathrm{T}X)^{-1}X^\mathrm{T}\vec{y}.
-$$
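-
-In code, you'd typically solve the normal equations rather than form the inverse explicitly. Here's a minimal sketch, assuming NumPy and randomly generated data:
-
-```python
-import numpy as np
-
-rng = np.random.default_rng(7)
-X = np.column_stack([np.ones(20), rng.normal(size=20), rng.normal(size=20)])
-y = X @ np.array([1.0, 2.0, -3.0]) + rng.normal(size=20)
-
-# solving X^T X w = X^T y directly is numerically safer than inverting X^T X
-w_star = np.linalg.solve(X.T @ X, X.T @ y)
-print(w_star)
-print(np.linalg.lstsq(X, y, rcond=None)[0])  # same answer via least squares
-```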
-
-For more visual intuition of this idea, check out this tutor-made animation!
-
-
-
-#### Lecture(s) to Review:
-
-- [Lecture 7](https://dsc40a.com/resources/lectures/lec07/lec07-filled.pdf)
-
-- [Lecture 8](https://dsc40a.com/resources/lectures/lec08/lec08-filled.pdf)
-
-### Why does the design matrix have a column of all 1s?
-
-In linear regression, the design matrix $$X$$ represents the features $$x_1, x_2, \ldots, x_d$$. Each row of $$X$$ corresponds to one data point, and each column corresponds to one feature. The parameter vector $$\vec{w}$$, which we multiply by $$X$$ to obtain our predictions $$\vec{h}$$, contains the weights for each feature, including the intercept or bias term $$w_0$$.
-
-The term $$w_0$$ is a constant that shifts the linear regression model vertically. This term is shared across all predictions. In other words, regardless of the values of the other features $$x_1, x_2, \ldots, x_d$$, the contribution of $$w_0$$ will be the same. Let's explore how this relates to our design matrix.
-
-When the design matrix $$X$$ is multiplied by the parameter vector $$\vec{w}$$, each row of $$X$$ produces a prediction $$h$$ depending on the values of the features in that row. Each value in the row is multiplied by its associated weight in the parameter vector, and the resulting products are summed to form a prediction. However, we want the bias term $$w_0$$ to contribute the same constant amount to every prediction, no matter the values in $$X$$.
-
-To ensure this, we include a column of 1s at the beginning of the design matrix $$X$$. This column represents the constant contribution of the bias term $$w_0$$, and will always be multiplied by $$w_0$$ when a particular observation is being used to make a prediction. In other words, regardless of the values of the features in $$X$$, every prediction will have $$w_0 \cdot 1$$ added to it.
-Let's give a quick example of this.
-
-Suppose we have a linear regression problem with two features. The design matrix $$X$$ is:
-
-$$
-X = \begin{bmatrix}
-1 & x_{11} & x_{12} \\
-1 & x_{21} & x_{22} \\
-1 & x_{31} & x_{32}
-\end{bmatrix}
-$$
-
-And the parameter vector $$\vec{w}$$ is:
-
-$$
-\vec{w} = \begin{bmatrix}
-w_0 \\
-w_1 \\
-w_2
-\end{bmatrix}
-$$
-
-To obtain the predicted values $$\vec{h}$$:
-
-$$
-\vec{h} = X \vec{w} = \begin{bmatrix}
-1 & x_{11} & x_{12} \\
-1 & x_{21} & x_{22} \\
-1 & x_{31} & x_{32}
-\end{bmatrix} \begin{bmatrix}
-w_0 \\
-w_1 \\
-w_2
-\end{bmatrix} = \begin{bmatrix}
-w_0 + w_1 x_{11} + w_2 x_{12} \\
-w_0 + w_1 x_{21} + w_2 x_{22} \\
-w_0 + w_1 x_{31} + w_2 x_{32}
-\end{bmatrix}
-$$
-
-As you can see in this example, our predictions all included the constant bias term $$w_0$$, because in forming our predictions, $$w_0$$ was always scaled by $$1$$, the first entry in each row of our design matrix. This setup ensures that the intercept is included in the model, and does not interfere with the relationship between the other features and the prediction.
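-
-Here's a tiny sketch of this setup (assuming NumPy; the feature values and weights are made up for illustration):
-
-```python
-import numpy as np
-
-x1 = np.array([2.0, 4.0, 6.0])  # hypothetical values for the first feature
-x2 = np.array([1.0, 3.0, 5.0])  # hypothetical values for the second feature
-X = np.column_stack([np.ones(len(x1)), x1, x2])  # prepend the column of 1s
-
-w = np.array([10.0, 2.0, -1.0])  # [w0, w1, w2]
-print(X @ w)  # every prediction includes w0 * 1 = 10
-```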
-
-#### Lecture(s) to Review:
-
-- [Lecture 9](https://dsc40a.com/resources/lectures/lec09/lec09-filled.pdf)
-
-### What is the projection of $$\vec{y}$$ onto $$\text{span}(\vec{x})$$ – is it $$w^*$$ or $$w^* \vec{x}$$?
-
-In multiple linear regression, the orthogonal projection of the vector $$\vec{y}$$ onto the span of the feature vectors $$\{\vec{x}^{(1)}, \vec{x}^{(2)}, ..., \vec{x}^{(d)}\}$$ is expressed as:
-
-$$
-\vec{h}^* = X\vec{w}^*.
-$$
-
-Here, $$\vec{w}^*$$ is a vector of scalar coefficients ($$w_1, w_2$$, etc.), and $$X$$ is the design matrix. In other words, $$\vec{w}^*$$ provides the specific coefficients with which to form a linear combination of your features to make predictions $$\vec{h}^*$$.
-
-So, to answer the question directly: $$w^* \vec{x}$$ is the projection of $$\vec{y}$$ onto $$\text{span}\{\vec{x}^{(1)}, \vec{x}^{(2)}, ..., \vec{x}^{(d)}\}$$, and $$w^*$$ is the vector of scalars used to make this projection when multiplied with $$\vec{x}$$.
-
-#### Lecture(s) to Review:
-
-- [Lecture 6](https://dsc40a.com/resources/lectures/lec06/lec06-filled.pdf) (Slide 28)
-
-### Do the normal equations work even when there is only one column in the matrix $$X$$?
-
-
-Yes! Let's look at two different cases where this can occur.
-
-
-#### Case 1: $$X$$ is a column of ones
-
-If $$X$$ is a column of ones, the model $$H(\vec{x}) = w_0$$ fits a constant line through the data. Using the normal equations,
-
-$$
-\vec{1}^T \vec{1} w_0^* = \vec{1}^T \vec{y}.
-$$
-
-$$\vec{1}^T \vec{1} = n$$, where $$n$$ is the number of data points, and $$\vec{1}^T \vec{y} = \sum_{i=1}^n y_i$$. Thus, the normal equations become:
-
-$$
-n \cdot w_0^* = \sum_{i=1}^n y_i.
-$$
-
-And, solving for $$w_0^*$$, we get
-
-$$
-w_0^* = \frac{1}{n} \sum_{i=1}^n y_i,
-$$
-
-which is the mean of the target values.
-
-#### Case 2: $$X$$ has different values
-
-Now, let's imagine that $$X$$ is a column vector with different values for each data point, representing a single feature:
-
-$$
-X = \begin{bmatrix}
-x_1 \\
-x_2 \\
-\vdots \\
-x_n
-\end{bmatrix}.
-$$
-
-In this case, the model $$H(x) = w_1 x$$ fits a line through the origin. The normal equations become
-
-$$
-X^T X w_1^* = X^T \vec{y}.
-$$
-
-Calculating the elements, we have
-
-$$
-\sum_{i=1}^n x_i^2 \cdot w_1^* = \sum_{i=1}^n x_i y_i,
-$$
-
-and the normal equations are reduced to
-
-$$
-w_1^* = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2}.
-$$
-
-So, to answer your question, we can absolutely use the normal equations when our design matrix $$X$$ has only one column!
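-
-Both cases are easy to check numerically. Here's a sketch assuming NumPy, with a made-up feature column for Case 2:
-
-```python
-import numpy as np
-
-y = np.array([72.0, 90.0, 61.0, 85.0, 92.0])
-
-# Case 1: X is a column of ones; the solution is the mean of y
-X1 = np.ones((5, 1))
-print(np.linalg.solve(X1.T @ X1, X1.T @ y))  # [80.]
-
-# Case 2: X is a single (hypothetical) feature; a line through the origin
-x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
-X2 = x.reshape(-1, 1)
-print(np.linalg.solve(X2.T @ X2, X2.T @ y))  # sum(x * y) / sum(x ** 2)
-```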
-
-#### Lecture(s) to Review:
-
-- [Lecture 7](https://dsc40a.com/resources/lectures/lec07/lec07-filled.pdf)
-
-- [Lecture 8](https://dsc40a.com/resources/lectures/lec08/lec08-filled.pdf)
-
-
-### When do two vectors in $$\mathbb{R}^2$$ span all of $$\mathbb{R}^2$$? When do $$n$$ vectors in $$\mathbb{R}^n$$ span all of $$\mathbb{R}^n$$?
-
-Two vectors in $$\mathbb{R}^2$$ span all of $$\mathbb{R}^2$$ when they are linearly independent (you cannot express one as a scalar multiple of the other). In other words, if $$\vec{u}$$ and $$\vec{v}$$ are two vectors in $$\mathbb{R}^2$$, they will span all of $$\mathbb{R}^2$$ if $$\vec{u}$$ and $$\vec{v}$$ are not collinear, i.e. not on the same line through the origin.
-
-Similarly, $$n$$ vectors in $$\mathbb{R}^n$$ span all of $$\mathbb{R}^n$$ when they are linearly independent. This means that no vector in the set can be expressed as a linear combination of the others.
-
-**Intuition**
-
-To span a space means to cover it entirely.
-
-Think of two vectors in $$\mathbb{R}^2$$. If one vector is a scalar multiple of the other, then they both point in the same direction or opposite directions, essentially lying on the same line. This means they can only cover that line and cannot cover any other directions.
-
-In higher dimensions, the same principle applies. For example, in $$\mathbb{R}^3$$, three linearly independent vectors point in different directions and can cover all of three-dimensional space. However, if one is a linear combination of the others, then the three vectors lie on the same plane, and can only span that plane.
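-
-One way to check linear independence numerically is to stack the vectors into a matrix and ask for its rank. Here's a small sketch, assuming NumPy:
-
-```python
-import numpy as np
-
-u = np.array([1.0, 2.0])
-v = np.array([3.0, -1.0])
-print(np.linalg.matrix_rank(np.column_stack([u, v])))  # 2: u and v span R^2
-
-w = np.array([2.0, 4.0])  # a scalar multiple of u
-print(np.linalg.matrix_rank(np.column_stack([u, w])))  # 1: they only span a line
-```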
-
-#### Lecture(s) to Review:
-
-- [Lecture 6](https://dsc40a.com/resources/lectures/lec06/lec06-filled.pdf) (Slide 26)
-
----
-
-## Multiple Linear Regression
-
-### When $$X^TX$$ isn't invertible, how do we solve the normal equations?
-
-When $$X^TX$$ isn't invertible, we cannot solve the normal equations using traditional methods. That is, if we cannot invert $$X^TX$$, we cannot solve $$\vec{w}^* = (X^\mathrm{T}X)^{-1}X^\mathrm{T}\vec{y}$$.
-
-Generally, this situation arises when one of the columns of our design matrix $$X$$ is a linear combination of the other columns in $$X$$. This leads to infinitely many solutions satisfying the normal equations, so finding a unique solution is impossible. However, if you are interested in other methods for solving the normal equations when $$X^TX$$ is not invertible, feel free to explore them! As a starting point, try researching the Moore-Penrose pseudo-inverse and ridge regression as two other approaches to solving for an optimal parameter vector – see the sketch below.
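-
-Here's a minimal sketch of the pseudo-inverse approach (assuming NumPy; the design matrix is deliberately constructed so that its third column is twice its second):
-
-```python
-import numpy as np
-
-x = np.array([1.0, 2.0, 3.0, 4.0])
-X = np.column_stack([np.ones(4), x, 2 * x])  # third column = 2 * second column
-y = np.array([3.0, 5.0, 7.0, 9.0])
-
-# X^T X is singular here, so np.linalg.solve would raise an error;
-# the pseudo-inverse picks the minimum-norm solution out of the infinitely many
-w_star = np.linalg.pinv(X) @ y
-print(w_star)
-print(X @ w_star)  # the fitted values still match y in this example
-```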
-
-#### Lecture(s) to Review:
-
-- [Lecture 7](https://dsc40a.com/resources/lectures/lec07/lec07-filled.pdf) (Slide 34)
-
-### What does it mean for a matrix to be full rank?
-
-A matrix is full rank when its columns are linearly independent – that is, when no column can be written as a linear combination of the others.
-
-In linear regression, the design matrix $$X$$ must be full rank for the normal equations to have a unique solution. If $$X$$ is not full rank, it implies multicollinearity among the features, which leads to infinitely many solutions when solving for the optimal parameters $$\vec{w}^*$$. For clarity:
-
-- **Full Rank:**
- If the design matrix $$X$$ is full rank, then all of its columns are linearly independent. This allows the normal equations:
-
- $$
- X^T X \vec{w}^* = X^T \vec{y}
- $$
-
- to have a unique solution.
-
-- **Not Full Rank:**
- If $$X$$ is not full rank, then some columns of $$X$$ are linear combinations of other columns. This leads to multicollinearity, which results in infinitely many solutions for the normal equations.
-
-#### Lecture(s) to Review:
-
-- [Lecture 7](https://dsc40a.com/resources/lectures/lec07/lec07-filled.pdf)
-
-
-### In multiple linear regression, is $$\vec{h}^*$$ orthogonal to $$\vec{y}$$?
-
-$$\vec{h}^*$$ is the optimal hypothesis vector; that is, $$\vec{h}^* = X\vec{w}^*$$. This means that $$\vec{h}^*$$ is the orthogonal projection of our observation vector $$\vec{y}$$ onto the span of our feature vectors $$\vec{x}^{(1)}, \vec{x}^{(2)}, ..., \vec{x}^{(d)}$$. As such, the error vector $$\vec{e} = \vec{y} - \vec{h}^*$$ is orthogonal to $$\vec{h}^*$$ (and to every vector in the span). However, this does not imply that $$\vec{h}^*$$ is orthogonal to $$\vec{y}$$ itself, or to any other vector aside from the error vector $$\vec{e}$$.
-
-#### Lecture(s) to Review:
-
-- [Lecture 7](https://dsc40a.com/resources/lectures/lec07/lec07-filled.pdf)
-
-- [Lecture 8](https://dsc40a.com/resources/lectures/lec08/lec08-filled.pdf)
-
-
-### Why does the multiple linear regression model with two features look like a plane?
-
-When we perform multiple linear regression with two features, we take information from two independent variables and predict some value for our target variable. Let's think about how this relates to a plane, both algebraically and geometrically.
-
-Algebraically, if our features are $$x_1$$ and $$x_2$$, our prediction function takes the form
-
-$$
-H(\vec{x}) = w_0^* + w_1^*x_1 + w_2^*x_2,
-$$
-
-which is the general formula for a plane.
-
-Geometrically, multiple linear regression with two features is the same idea:
-
-Each feature ($$x_1$$ and $$x_2$$) corresponds to one axis, and our target variable (the variable we are trying to predict) is represented by the vertical axis. When we vary the values on the $$x_1$$ and $$x_2$$ axes, we are exploring the values of our prediction function as $$2$$ features vary, which traces out a $$2$$-dimensional surface.
-
-If this question also concerns why these predictions form a plane instead of some other surface, perhaps one with curves or bends, we can briefly address that too. In a linear regression model, the relationship between the input features and the target variable is linear: the predicted value is a linear combination of the input features, with each feature having a fixed weight (or coefficient). Another way to say this is that in a linear model with $$2$$ feature vectors (together with the intercept column of 1s), our predictions must lie within the span of our feature vectors. In $$3$$ dimensions, this span is a plane. (This concept is addressed in Lectures 5 and 6, if you want a refresher on span!)
-
-If we had a nonlinear prediction function with $$2$$ features, we could see a prediction function that forms a curved surface in $$3$$ dimensions. However, as a consequence of performing linear regression, our prediction function will form a plane.
-
-For more visual intuition of this idea, check out the first 35 seconds of this video!
-
-
-
-#### Lecture(s) to Review:
-
-- [Lecture 6](https://dsc40a.com/resources/lectures/lec06/lec06-filled.pdf) (Slide 23)
-
-- [Lecture 8](https://dsc40a.com/resources/lectures/lec08/lec08-filled.pdf) (Slide 30)
-
----
-
-
-
-
-
-
diff --git a/pages/playlists.md b/pages/playlists.md
deleted file mode 100644
index e28199d..0000000
--- a/pages/playlists.md
+++ /dev/null
@@ -1,31 +0,0 @@
----
-layout: page
-title: 🎵 Playlists
-description: DSC 100 music, curated by your wonderful course staff!
-nav_order: 8
-nav_exclude: true
----
-
-# 🎵 Playlists
-
-Welcome to our theorem-thumping playlist—perfect for tackling 40a material and getting in the mode to do some math. Every track has been carefully derived, ensuring your productivity converges. Rest easy! Each beat is rigorously optimized by our in-house team of audiophile-maticians, so your mind can flow freely from gradient descent to your next big combinatoric breakthrough. Put on your headphones, hit play, and let these tunes transform your study sessions into a full-rank masterpiece. In this playlist, there are no null vectors—only real solutions that keep you in perfect rhythm. Enjoy, and keep integrating those good vibes!
-
-## Class Kickoff
-
-A curated playlist from the musical mind of Jiesen! Welcome to DSC40a.
-
-
-
-## Study Session Jams
-
-7 hours long, just enough to get you through a full work day... minus meetings.
-
-
-
-
-
-## Tunes for Deadlines!
-
-### 60min Math Crunch
-
-
diff --git a/pages/resources.md b/pages/resources.md
index bce5828..48fcf09 100644
--- a/pages/resources.md
+++ b/pages/resources.md
@@ -2,7 +2,7 @@
layout: page
title: 📚 Resources
description: Useful links and resources.
-has_children: true
+has_children: false
permalink: /resources/
nav_order: 5
---
@@ -11,102 +11,67 @@ nav_order: 5
---
-## Past Lecture Videos (by Janine Tiefenbruck)
-
-In the table below, you can find lecture videos created by Janine Tiefenbruck, who created this course and taught it many times. The lecture videos linked below will generally be pretty similar in content coverage to our lectures, but there are indeed differences in notation and order. You are responsible for everything covered in OUR lectures, even if something doesn't appear in the videos below. When in doubt, refer to the main lecture slides & videos posted in Home and ask questions on Piazza.
-
-| **Video** | **Topics** |
-| --- | --- |
-|[Video 1](https://youtu.be/6tP6crJr32U) | learning from data, mean absolute error|
-|[Video 2](https://youtu.be/ad2S7XnCSVc) | minimizing mean absolute error |
-|[Video 3](https://youtu.be/LYJW_2odH_E) | mean squared error |
-|[Video 4](https://youtu.be/usam2JTOaLg) | empirical risk minimization, general framework, 0-1 loss |
-|[Video 5](https://youtu.be/Syw_PfmWDRg) | UCSD loss |
-|[Video 6](https://youtu.be/F2ImJ3dkkZ8) | gradient descent |
-|[Video 7](https://youtu.be/1TjwPNY2Gzw) | gradient descent demo, convexity |
-|[Video 8](https://youtu.be/NdkDK3Jb6SY) | spread |
-|[Video 9](https://youtu.be/3RiaKo2jGIk) | linear prediction rule |
-|[Video 10](https://youtu.be/Ac1EFASUA9M)| least squares solutions |
-|[Video 11](https://youtu.be/0sWcrJSAUFQ)| regression interpretation |
-|[Video 12](https://youtu.be/bTp4vMu_9N0)| nonlinear trends |
-|[Video 13](https://youtu.be/7k3KtI4NFas)| linear algebra for regression |
-|[Video 14](https://youtu.be/2ebdHtxb4as)| gradient, normal equations |
-|[Video 15](https://youtu.be/uIbnLq6IZLI)| polynomial regression, nonlinear trends |
-|[Video 16](https://youtu.be/tuezO9tiXnE)| multiple regression |
-|[Video 17](https://youtu.be/dDn6iPpbH4E)| k-means clustering |
-|[Video 18](https://youtu.be/UPxe97Wc1gM)| k-means clustering, cost function, practical considerations |
-|[Video 19](https://youtu.be/ikLzykAaLOk)| probability, basic rules |
-|[Video 20](https://youtu.be/qHOG3yc4UzE)| conditional probability |
-|[Video 21](https://youtu.be/-3v6UZ_Cq9k)| probability, random sampling, sequences |
-|[Video 22](https://youtu.be/AfM9akq6PL0)| combinatorics, sequences, sets, permutations, combinations |
-|[Video 23](https://youtu.be/fuaFj7aeg9I)| counting and probability practice |
-|[Video 24](https://youtu.be/I3ZHwf8qWS4)| law of total probability, Bayes' Theorem |
-|[Video 25](https://youtu.be/AUiX4gWWsuE)| independence, conditional independence |
-|[Video 26](https://youtu.be/WLQMoA4ZAus)| naive Bayes |
-|[Video 27](https://youtu.be/4tGtziW901Y)| text classification, spam filter, naive Bayes |
+## COGS 108 Course Resources
----
-
-## Tutor-Created Supplemental Resources
-
-These resources were created by tutors as part of their Final Project for DSC 95, the first-time tutor training course.
-
-### Getting Started
-- Maryam's [tutorial on using LaTeX](https://www.youtube.com/watch?v=_nBsdQpsEiE)
-- Charlie Sun's [example midterm notes sheet](https://drive.google.com/file/d/1XRW26AeMzdvOu-LNks-4sso6e1xXcH4b/view?usp=sharing)
-- Candus Shi's [slides](https://docs.google.com/presentation/d/1_SvEkGPlfel_NfSg4w-M5HvEa8UdO24l9FW4VokmuaQ/edit?usp=sharing) and [video on the importance of math as a data scientist](https://drive.google.com/file/d/1dMej66xoUzj5g10XvOn4i4fq5Ri1-HQ3/view?usp=sharing)
-
-### Regression & Linear Algebra
-- Alan Wang's [video showing the equivalence of the various regression formulas](https://youtu.be/h2qMB1g9zSQ?si=55ziwlomsIFkC9d7)
-- Brighten Hayama and Yosen Lin's [linear regression overview](https://drive.google.com/file/d/1ayBd1EdM5O4jmPgi0DtM2eWKNp5262nD/view?usp=sharing)
-
-### Probability & Combinatorics
-- Pallavi Prabhu's [permutations and combinations guide](https://drive.google.com/file/d/1v5AFjUSzeleVQBe2m2Wd27ViSUC00sY6/view?usp=sharing)
-- Varun Pabreja's [video on solving combinatorics problems](https://www.youtube.com/watch?v=8PndRAcRC1Q), including a fun application
-- Pranav Rebala's [conditional probability presentation](https://docs.google.com/presentation/d/1s-W4NTHwfBKBFdB1fjqDXmWPk69G0K0nse7ukfRQ2bw/edit?usp=sharing)
-- Benjamin Xue's [visualizing independence guide](https://docs.google.com/document/d/e/2PACX-1vTV_h_3yrVwY5JAZ2ZIyKQ6J84t3jsHtaoEMDpduumvtWu5idmf1timb9SfrEpqMdqqkYPvpjqned8Z/pub)
-- Javier Ponce's [probability guide](https://docs.google.com/presentation/d/1INFkQA6H06NEicGpzEMornQWaAwYb1zUVRPOF9nGSDw/edit?usp=sharing)
-- Utkarsh Lohia's [video on the Naïve Bayes classifier](https://www.youtube.com/watch?v=3yCu_l2uBAw)
+This page collects useful links and supplemental resources for COGS 108. These materials are meant to support the main course content, projects, and lectures. You are responsible for the material covered in this course, so use these resources as supplements rather than replacements.
+When in doubt, refer to the lecture materials, course announcements, and official course communication channels.
---
-## Loss Functions and Regression
+## Getting Started
-- Other lectures on [Loss Functions](http://ds100.org/su20/lecture/lec11) and [Simple Linear Regression](http://ds100.org/su20/lecture/lec12/).
- - These are from a different course for a different audience, and use different notation and terminology. However, the high-level ideas are similar to those in the first few weeks of our course.
-- [Gradient Descent visualizer](https://uclaacm.github.io/gradient-descent-visualiser/#playground).
+- [Markdown Guide](https://www.markdownguide.org/)
+- [GitHub Docs](https://docs.github.com/)
+- [Jupyter Notebook Documentation](https://docs.jupyter.org/)
+- [Python Documentation](https://docs.python.org/3/)
+- [Pandas Documentation](https://pandas.pydata.org/docs/)
+- [Matplotlib Documentation](https://matplotlib.org/stable/index.html)
+- [Seaborn Documentation](https://seaborn.pydata.org/)
---
-## Probability
+## Data Science and Analysis
-Unlike the first half of the course, where we had course notes written specifically for this class, we don't have COGS 108-specific notes for the second half of the class, because there are many high-quality resources available online that cover the same material. Below, you'll find links to some of these resources.
+- [UC Berkeley Data 8 Textbook](https://inferentialthinking.com/)
+- [OpenIntro Statistics](https://www.openintro.org/book/os/)
+- [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)
+- [Pandas User Guide](https://pandas.pydata.org/docs/user_guide/index.html)
+- [Scikit-learn User Guide](https://scikit-learn.org/stable/user_guide.html)
-### Readings and Sources of Practice Problems
+---
-- [Open Intro Statistics](https://leanpub.com/os): Sections 2.1, 2.3, and 2.4 cover the probability we are learning in this course at a good level for undergraduates. This is a good substitute for a textbook, similar to the course notes that we had for the first part of the course. It goes through the definitions, terminology, probability rules, and how to use them. It's succinct and highlights the most important things.
+## Data Visualization
-- [Probability for Data Science](https://textbook.prob140.org/): Chapters 1 and 2 of this book have a lot of good examples demonstrating some standard problem-solving techniques. This book should be primarily useful for more problems to practice and learn from. This book is written at a good level for students in this class. It is used at UC Berkeley in their Probability for Data Science course. Our course only really covers material from the first two chapters, but if you want to extend your learning of probability as it applies to data science, this is a good book to help you do that.
+- [Fundamentals of Data Visualization](https://clauswilke.com/dataviz/)
+- [Matplotlib Tutorials](https://matplotlib.org/stable/tutorials/index.html)
+- [Seaborn Tutorial](https://seaborn.pydata.org/tutorial.html)
+- [Plotly Python Documentation](https://plotly.com/python/)
-- [Theory Meets Data](http://stat88.org/textbook/content/intro.html): Chapters 1 and 2 of this book cover similar content to Chapters 1 and 2 of the Probability for Data Science book, but with different prose and examples. It is used at UC Berkeley for a more introductory Probability for Data Science course.
+---
-- [Grinstead and Snell's Introduction to Probability](https://cse103.github.io/Resources/GrinsteadSnell.pdf): Chapters 1, 3, and 4.1 of this book cover the material from our class. This book is a lot longer and more detailed than the others, and it uses more formal mathematical notation. It should give you a very thorough understanding of probability and combinatorics, but it is a lot more detailed, so the more abbreviated resources above will likely be more useful. With that said, this book is written at a good level for undergraduates and is used in other undergraduate probability classes at UCSD, such as CSE 103.
+## Project and Research Skills
-- [Introduction to Mathematical Thinking](http://imt-decal.org): This course covers topics in discrete math, some of which are relevant to us (in particular, set theory and counting). In addition to the lecture videos linked on the homepage, you may want to look at the [notes section](http://notes.imt-decal.org).
+- [UCSD Library Research Guides](https://ucsd.libguides.com/)
+- [Google Dataset Search](https://datasetsearch.research.google.com/)
+- [Kaggle Datasets](https://www.kaggle.com/datasets)
+- [UCI Machine Learning Repository](https://archive.ics.uci.edu/)
+- [Data.gov](https://data.gov/)
-- [Khan Academy: Counting, Permutations, and Combinations](https://www.khanacademy.org/math/statistics-probability/counting-permutations-and-combinations#combinatorics-probability): Khan Academy has a good unit called Counting, Permutations, and Combinations that should be pretty helpful for the combinatorics we are learning in this class. A useful aspect of it is the practice questions that combine permutations and combinations. Most students find that the hardest part of these counting problems is knowing when to use permutations and when to use combinations. These practice questions have them mixed together, so you really get practice learning which is the right technique to apply to which situation.
+---
-### Visualizations
+## Writing and Communication
-- [Conditional probability: A Visual explanation by Victor Powell for Setosa](https://setosa.io/conditional/)
-- [Seeing Theory](https://seeing-theory.brown.edu)
+- [UCSD Writing Hub](https://writinghub.ucsd.edu/)
+- [Purdue OWL](https://owl.purdue.edu/)
+- [The Craft of Scientific Writing](https://www.nature.com/scitable/ebooks/english-communication-for-scientists-14053993/)
---
-## Past Exams
+## Past Exams and Practice
-Some past exam problems can be found at [practice.dsc40a.com](https://practice.dsc40a.com).
+Practice materials will be posted here if they are made available for this course.
---
-If you find another helpful resource, let us know and we can link it here!
\ No newline at end of file
+
+If you find another helpful resource, let us know and we can link it here!
\ No newline at end of file
diff --git a/pages/resources/grokking_bayes.md b/pages/resources/grokking_bayes.md
deleted file mode 100644
index 3446669..0000000
--- a/pages/resources/grokking_bayes.md
+++ /dev/null
@@ -1,520 +0,0 @@
----
-layout: none
-title: Grokking Bayes'
-has_children: false
-parent: 📚 Resources
-nav_order: 1
-permalink: /resources/grokking-bayes
----
-
Grokking Bayes' Theorem
-
-
This guide walks through a practical example that illustrates Bayes' theorem using a college major scenario. Let's explore how our intuition can sometimes lead us astray and how Bayes' theorem helps us reason correctly about probabilities.
-
-
The Problem Statement
-
-
Consider the following description of a person named Ahmed:
-
-
-
Ahmed is outgoing and confident.
-
-
-
-
Question 1: Based solely on this description, which seems more likely?
-
-
Ahmed is a communications major
-
Ahmed is a STEM major
-
-
Take a moment to think about your intuitive answer before proceeding.
-
-
-
Examining Our Intuition
-
-
Many people intuitively choose "communications major" because the description seems to match stereotypical traits we associate with students in communications programs. This is using what psychologists call the representativeness heuristic - judging probability by how well something matches our mental prototype.
-
-
-
Question 2: What critical information are we missing when we make this judgment if we were to disregard his description?
-
-
-
-Answer:
-
We're missing the base rates - how common communications majors and STEM majors are in the college population. This is crucial information for making an accurate probability assessment.
-
-
-
Adding Base Rates
-
-
In many colleges and universities, STEM majors significantly outnumber communications majors:
-
-
Approximately 15% of students are communications majors
-
Approximately 85% of students are STEM majors
-
-
-
This means STEM majors are roughly 5-6 times more common than communications majors in the overall student population.
-
-
-
Question 3: How should this information affect our probability estimate?
-
-
-
-Answer:
-
This drastically changes the calculation. Even if the description matches communications majors better, we need to account for the fact that we're much more likely to randomly select a STEM major than a communications major from the student population.
-
-
-
Visualizing the Problem
-
-
Let's represent our student population as a grid where:
-
-
Communications majors are represented in orange (a smaller portion on the left)
-
STEM majors are represented in blue (the much larger remaining portion)
-
-
-*[Visualization: a grid of 200 students – 30 communications majors (15%) in orange and 170 STEM majors (85%) in blue. Highlighting the students who match the "outgoing and confident" description leaves 21 communications majors (70% of 30) and 51 STEM majors (30% of 170), so P(Comm given description) = 21 / (21 + 51) ≈ 29.2%. Even though a higher percentage of communications majors match the description (70% vs. 30%), more STEM majors match in absolute terms because STEM majors are much more common.]*
-
Question 4: If we were to randomly select a student from this population, what is the probability they would be a communications major?
-
-
-
-Answer:
-
About 15%. In our simplified visualization, we're showing 30 communications majors out of a total of 200 students (30 communications + 170 STEM).
-
-
-
Accounting for the Description
-
-
Now let's consider how well the description matches each major:
-
-
Suppose about 70% of communications majors are outgoing and confident (21 out of our 30 communications majors)
-
Suppose only about 30% of STEM majors are outgoing and confident (51 out of our 170 STEM majors)
-
-
-
We can visualize this by highlighting the portion of each group that matches the description, as shown in the middle section of our visualization.
-
-
-
Question 5: Even though a higher percentage of communications majors match the description, why might there still be more STEM majors who match it?
-
-
-
-Answer:
-
Because STEM majors greatly outnumber communications majors in the total population. In this case, even though only 30% of STEM majors match the description compared to 70% of communications majors, the absolute number is still larger: 51 STEM majors vs. 21 communications majors.
-
-
-
Calculating with Bayes' Theorem
-
-
Let's use the numbers from our visualization:
-
-
We have 30 communications majors in our population
-
We have 170 STEM majors
-
70% of communications majors (21 people) match the description
-
30% of STEM majors (51 people) match the description
-
-
-
-
Question 6: Among students matching the description, how many are communications majors and how many are STEM majors?
-
-
-
-Answer:
-
21 communications majors and 51 STEM majors match the description.
-
-
-
-
Question 7: What is the probability that someone matching the description is a communications major?
Even though the description matches communications majors at a higher rate, the probability is still only about 29% that Ahmed is a communications major.
-
-
-
As shown in the visualization, even though the description is more representative of communications majors, there are still many more STEM majors who match it simply because STEM majors are so much more common.
-
-
Formalizing with Bayes' Theorem
-
-
Bayes' theorem provides the mathematical framework for this kind of reasoning:
-
-
-
$$P(A|B) = \frac{P(B|A) \times P(A)}{P(B)}$$
-
-
-
Where:
-
-
$$P(A|B)$$ is the probability of A given B has occurred (posterior)
-
$$P(B|A)$$ is the probability of B given A (likelihood)
The probability increases, but it's still more likely that Ahmed is a STEM major.
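-
-In our example, let $$A$$ be "Ahmed is a communications major" and $$B$$ be "Ahmed matches the description." Expanding the denominator with the law of total probability and plugging in the numbers from the visualization gives:
-
-$$P(A|B) = \frac{0.70 \times 0.15}{0.70 \times 0.15 + 0.30 \times 0.85} = \frac{0.105}{0.36} \approx 0.292,$$
-
-matching the roughly 29% we found by counting students directly.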
-
-
-
The Heart of Bayes' Theorem
-
-
The fundamental insight of Bayesian reasoning is that we need to consider both:
-
-
How likely each hypothesis is to begin with (prior probabilities)
-
How well the evidence fits each hypothesis (likelihoods)
-
-
-*[Visualization: three panels – "All possibilities," "All possibilities fitting the description," and the calculation of P(Comm given description) within that restricted space.]*
-
-If this line of reasoning, where seeing new evidence restricts the space of possibilities, makes sense to you, then congratulations! You understand the heart of Bayes' theorem.
-
-
As shown in the visualization:
-
-
The left panel shows all possibilities with their original distribution
-
The middle panel shows how the evidence restricts the space of possibilities
-
The right panel shows how we calculate the final probability within that restricted space
-
-
-
-
Question 9: Why is it incorrect to only consider how well the description matches each major?
-
-
-
-Answer:
-
Because we'd be ignoring the base rates - how common each major is in the student population. This leads to the base rate fallacy, where we overemphasize the matching characteristics and undervalue the prior probabilities.
-
-
-
Real-World Applications
-
-
Bayes' theorem is crucial in many real-world scenarios:
-
-
-
Question 10: How might this type of reasoning be relevant when interpreting personality test results?
-
-
-
-Answer:
-
When a personality test suggests someone has traits often associated with a certain profession or personality type, we should consider not just how well the traits match the stereotype, but also how common that profession or personality type is in the general population.
-
-
-
-
Question 11: How could Bayesian reasoning help with making predictions about student success in different programs?
-
-
-
-Answer:
-
When trying to predict which program a student might succeed in based on their traits, we should consider both how well their traits match successful students in each program AND how many students succeed in each program overall. A program with higher overall success rates might be a better bet even if another program seems to match their traits slightly better.
-
-
-
Conclusion
-
-
The communications/STEM major example highlights a common error in probabilistic reasoning - ignoring base rates. Bayes' theorem provides a formal framework for combining prior knowledge with new evidence to reach more accurate conclusions.
-
-
Remember:
-
-
Intuition often focuses on how well evidence matches our hypotheses
-
Proper Bayesian reasoning requires also considering how common each hypothesis is to begin with
-
When base rates are extreme, they can outweigh even strong evidence to the contrary
-
-
-
-
Final Question: Can you think of a situation in your own life where you might have fallen prey to the base rate fallacy? How could you apply Bayesian reasoning to avoid this error in the future?