-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'geekan:main' into feat-opt_docs
- Loading branch information
Showing
19 changed files
with
2,908 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,4 +15,5 @@ src/*/blog | |
src/*/rfcs | ||
src/*/DataInterpreter | ||
src/*/demos | ||
src/*/sela | ||
src/utils/diff.ts |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
[ | ||
{ | ||
"id": "0", | ||
"parent_id": null, | ||
"avg_score": "73.7", | ||
"dev_score": "83.4", | ||
"visits": 10, | ||
"order": 1, | ||
"instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", | ||
"code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", | ||
"active": true | ||
}, | ||
{ | ||
"id": "0-4", | ||
"parent_id": "0", | ||
"avg_score": "0.0", | ||
"dev_score": "0.0", | ||
"visits": 0, | ||
"order": "1.5", | ||
"instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", | ||
"code": "", | ||
"active": false | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
[ | ||
{ | ||
"id": "0", | ||
"parent_id": null, | ||
"avg_score": "73.7", | ||
"dev_score": "83.4", | ||
"visits": 10, | ||
"order": 1, | ||
"instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", | ||
"code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", | ||
"active": true | ||
}, | ||
{ | ||
"id": "0-3", | ||
"parent_id": "0", | ||
"avg_score": "81.6", | ||
"dev_score": "81.0", | ||
"visits": 4, | ||
"order": 2, | ||
"instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", | ||
"code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", | ||
"active": true | ||
}, | ||
{ | ||
"id": "0-3-0", | ||
"parent_id": "0-3", | ||
"avg_score": "0.0", | ||
"dev_score": "0.0", | ||
"visits": 0, | ||
"order": "2.5", | ||
"instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", | ||
"code": "", | ||
"active": false | ||
}, | ||
{ | ||
"id": "0-3-1", | ||
"parent_id": "0-3", | ||
"avg_score": "0.0", | ||
"dev_score": "0.0", | ||
"visits": 0, | ||
"order": "2.5", | ||
"instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", | ||
"code": "", | ||
"active": false | ||
}, | ||
{ | ||
"id": "0-3-2", | ||
"parent_id": "0-3", | ||
"avg_score": "0.0", | ||
"dev_score": "0.0", | ||
"visits": 0, | ||
"order": "2.5", | ||
"instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", | ||
"code": "", | ||
"active": false | ||
}, | ||
{ | ||
"id": "0-3-4", | ||
"parent_id": "0-3", | ||
"avg_score": "0.0", | ||
"dev_score": "0.0", | ||
"visits": 0, | ||
"order": "2.5", | ||
"instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", | ||
"code": "", | ||
"active": false | ||
}, | ||
{ | ||
"id": "0-4", | ||
"parent_id": "0", | ||
"avg_score": "0.0", | ||
"dev_score": "0.0", | ||
"visits": 0, | ||
"order": "1.5", | ||
"instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", | ||
"code": "", | ||
"active": false | ||
} | ||
] |
Oops, something went wrong.