Gradient Descent Algoritması

29.08.2025

$f(x)$ çok değişkenli bir fonksiyon olmak üzere bir $a$ noktasında türevlenebilir ise, $f(x)$, $a$ noktasından negatif gradyan yönünde, yani $-\nabla f(a)$ yönünde gidildiğinde en hızlı şekilde azalır. Burada mantık, yapılan tahmin $\hat{y}$ ile gerçek değer $y$ arasındaki farkı azaltmak, yani modelimizin tahmin doğruluğunu en yüksek düzeye çıkarmaktır. Burada elbette ki overfitting yapmadan düzgün bir doğruluk seviyesine çıkmak istiyoruz.

$$a_{n + 1} = a_n - \eta \nabla f(a_n)$$

Buna göre her bir iterasyonda hata oranı azar azar azaltılır. Önemli bir detay, öğrenme oranı $\eta$'yı çok yüksek tutmamaktır. Yüksek bir öğrenme oranında değişim oranı çok yüksek olacağı için yerel/global minimum noktasına yaklaşmak bir yana, ondan uzaklaşma ihtimalimiz artar. Eğitilmiş bir modelde zikzaklı çizgiler çıkmasının sebebi de öğrenme oranı olan $\eta$'nın çok yüksek olmasıdır.

Gradient descent algoritmasının sırasıyla iki farklı dilde gösterimi:

Python

import numpy as np
 
def gradient_descent(x, y, learning_rate=0.01, num_iterations=1000):
    """Fit the line ``theta[0] + theta[1] * x`` to ``y`` with batch gradient descent.

    Both parameters start at zero. On every iteration the gradient of the
    squared-error cost is averaged over all samples and each parameter is
    moved against its gradient, scaled by ``learning_rate``.

    Args:
        x: Input values (the caller in this file passes a 1-D NumPy array).
        y: Target values; ``len(y)`` is used as the number of samples.
        learning_rate: Step size applied to each gradient.
        num_iterations: Number of update steps to perform.

    Returns:
        A NumPy array ``[theta0, theta1]`` holding intercept and slope.
    """
    sample_count = len(y)

    # theta[0] is the intercept, theta[1] is the slope; both start at zero.
    theta = np.zeros(2)

    for _step in range(num_iterations):
        # Prediction error of the current line on every sample.
        residual = (theta[0] + theta[1] * x) - y

        # Both gradients are computed before either parameter is touched,
        # so the two updates below use the same (old) parameter values.
        intercept_grad = (1 / sample_count) * np.sum(residual)
        slope_grad = (1 / sample_count) * np.sum(residual * x)

        theta[0] -= learning_rate * intercept_grad
        theta[1] -= learning_rate * slope_grad

    return theta
 
# Demo: fit a line to five sample points and print the learned parameters.
if __name__ == "__main__":
    # Toy training set (inputs and their targets)
    x, y = np.array([1, 2, 3, 4, 5]), np.array([2, 4, 5, 4, 5])

    # Fit using the default learning rate and iteration count
    theta = gradient_descent(x, y)

    print(f"Optimal parameters: theta0 = {theta[0]:.3f}, theta1 = {theta[1]:.3f}")

C++

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>
 
/// @brief Fits the line theta0 + theta1 * x to the samples (x, y) using
///        batch gradient descent on the mean squared error.
///
/// Both parameters start at 0. Each iteration averages the gradient over all
/// samples and moves each parameter against its gradient.
///
/// @param x              Input values, one per sample.
/// @param y              Target values; must have the same length as @p x.
/// @param learning_rate  Step size applied to each gradient.
/// @param num_iterations Number of update steps; values <= 0 perform none.
/// @return A two-element vector {theta0, theta1} (intercept, slope).
/// @throws std::invalid_argument if @p x and @p y differ in length or are empty.
std::vector<double> gradient_descent(const std::vector<double>& x, 
                                    const std::vector<double>& y, 
                                    double learning_rate = 0.01, 
                                    int num_iterations = 1000) {
    // Indexing x[j] for j < y.size() is only valid when the lengths match.
    if (x.size() != y.size()) {
        throw std::invalid_argument("gradient_descent: x and y must have the same length");
    }
    // With no samples the averaged gradient would be 0/0 (NaN).
    if (y.empty()) {
        throw std::invalid_argument("gradient_descent: training data must not be empty");
    }

    // Number of training examples (kept as size_t to avoid narrowing).
    const std::size_t m = y.size();
    const double sample_count = static_cast<double>(m);

    // Initialize parameters (theta0, theta1)
    double theta0 = 0.0;
    double theta1 = 0.0;

    // Gradient descent loop
    for (int i = 0; i < num_iterations; ++i) {
        double grad0 = 0.0;
        double grad1 = 0.0;

        // Accumulate the gradient contributions of every sample.
        for (std::size_t j = 0; j < m; ++j) {
            const double y_pred = theta0 + theta1 * x[j];
            const double error = y_pred - y[j];
            grad0 += error;
            grad1 += error * x[j];
        }

        grad0 /= sample_count;
        grad1 /= sample_count;

        // Both gradients were computed from the old parameters, so the
        // two updates are simultaneous.
        theta0 -= learning_rate * grad0;
        theta1 -= learning_rate * grad1;
    }

    return {theta0, theta1};
}
 
// Demo entry point: fits a line to five sample points and prints the
// resulting intercept (theta0) and slope (theta1).
int main() {
    // Toy training set: inputs and their targets.
    const std::vector<double> x{1, 2, 3, 4, 5};
    const std::vector<double> y{2, 4, 5, 4, 5};

    // Fit with the default learning rate and iteration count.
    const std::vector<double> theta = gradient_descent(x, y);

    std::cout << "Optimal parameters: theta0 = " << theta[0];
    std::cout << ", theta1 = " << theta[1] << std::endl;

    return 0;
}