From 11dcdb74ebc78695687ab150f89e3b22cb564e2f Mon Sep 17 00:00:00 2001 From: Hayden McAfee Date: Sun, 12 Nov 2023 00:02:27 -0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=9B=91=20Stop=20using=20MyPurdue=20pages?= =?UTF-8?q?=20that=20require=20auth=20(#57)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent changes to Purdue's authentication processes have made scraping MyPurdue pages that require authorization not feasible. This change updates the scraping process to avoid these pages and resort to workarounds (such as manually defined data mapping tables) or omitting data entirely (such as enrollment information). See issues #54, #55, #56 for more information on the changes to available data. --- README.md | 13 +- src/CatalogSync/Program.cs | 21 +- .../Connections/IMyPurdueConnection.cs | 4 - src/Scraper/Connections/MyPurdueConnection.cs | 181 ++---- src/Scraper/MyPurdueScraper.cs | 533 ++++++++++-------- src/Tests/ParsingTests.cs | 14 +- 6 files changed, 354 insertions(+), 412 deletions(-) diff --git a/README.md b/README.md index 6445767..615047e 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ there through the query tester at [http://api.purdue.io/](api.purdue.io/). ## Tools -Purdue.io is written in C# on .NET 5. It will run natively on most major +Purdue.io is written in C# on .NET 8. It will run natively on most major architectures and operating systems (Windows, Linux, Mac OS). Entity Framework is used to communicate with an underlying database provider. Currently, @@ -71,10 +71,7 @@ To start developing locally, install the .NET SDK. CatalogSync is the process used to pull course data from MyPurdue and synchronize it to a relational database store. -In order to access detailed course section information, CatalogSync requires a valid -MyPurdue username and password. - -CatalogSync also accepts options to configure which database provider and connection it uses. 
+CatalogSync accepts options to configure which database provider and connection it uses. Additional flags are available to configure CatalogSync behavior. Use the `--help` flag for more information. @@ -83,10 +80,10 @@ Use the `--help` flag for more information. cd src/CatalogSync # To sync to default SQLite file purdueio.sqlite -dotnet run -- -u USERNAME -p PASSWORD +dotnet run # To sync to a specific SQLite file -dotnet run -- -u USERNAME -p PASSWORD -d Sqlite -c "Data Source=path/to/file.sqlite" +dotnet run -- -d Sqlite -c "Data Source=path/to/file.sqlite" ``` CatalogSync will begin synchronizing course catalog data to `purdueio.sqlite`. @@ -96,7 +93,7 @@ and connection string: ```sh # To sync to a local PostgreSQL instance: -dotnet run -- -u USERNAME -p PASSWORD -d Npgsql -c "Host=localhost;Database=purdueio;Username=purdueio;Password=purdueio" +dotnet run -- -d Npgsql -c "Host=localhost;Database=purdueio;Username=purdueio;Password=purdueio" ``` ## API diff --git a/src/CatalogSync/Program.cs b/src/CatalogSync/Program.cs index 519c6b9..8fa4a4a 100644 --- a/src/CatalogSync/Program.cs +++ b/src/CatalogSync/Program.cs @@ -21,12 +21,6 @@ public enum DataProvider public class Options { - [Option(shortName: 'u', longName: "user", HelpText = "MyPurdue User Name")] - public string MyPurdueUser { get; set; } - - [Option(shortName: 'p', longName: "pass", HelpText = "MyPurdue Password")] - public string MyPurduePass { get; set; } - [Option(shortName: 'd', longName: "data-provider", Default = DataProvider.Sqlite, HelpText = "The database provider to use")] public DataProvider DataProvider { get; set; } @@ -58,19 +52,6 @@ static async Task Main(string[] args) static async Task RunASync(Options options) { - string username = options.MyPurdueUser ?? - Environment.GetEnvironmentVariable("MY_PURDUE_USERNAME"); - string password = options.MyPurduePass ?? 
- Environment.GetEnvironmentVariable("MY_PURDUE_PASSWORD"); - - if ((username == null) || (password == null)) - { - Console.Error.WriteLine("You must provide a MyPurdue username and password " + - "to sync course data. Use command line options or environment variables " + - "MY_PURDUE_USERNAME and MY_PURDUE_PASSWORD."); - return; - } - var loggerFactory = LoggerFactory.Create(b => b.AddSimpleConsole(c => c.TimestampFormat = "hh:mm:ss.fff ")); @@ -85,7 +66,7 @@ static async Task RunASync(Options options) var behavior = options.SyncAllTerms ? TermSyncBehavior.SyncAllTerms : TermSyncBehavior.SyncNewAndCurrentTerms; - var connection = await MyPurdueConnection.CreateAndConnectAsync(username, password, + var connection = new MyPurdueConnection( loggerFactory.CreateLogger()); var scraper = new MyPurdueScraper(connection, loggerFactory.CreateLogger()); diff --git a/src/Scraper/Connections/IMyPurdueConnection.cs b/src/Scraper/Connections/IMyPurdueConnection.cs index 0d9b28f..6b45c48 100644 --- a/src/Scraper/Connections/IMyPurdueConnection.cs +++ b/src/Scraper/Connections/IMyPurdueConnection.cs @@ -14,9 +14,5 @@ public interface IMyPurdueConnection // Retrieves the contents of bwckschd.p_get_crse_unsec from MyPurdue for the given term // and subject Task GetSectionListPageAsync(string termCode, string subjectCode); - - // Retrieves the contents of bwskfcls.P_GetCrse_Advanced from MyPurdue for the given term - // and subject - Task GetSectionDetailsPageAsync(string termCode, string subjectCode); } } \ No newline at end of file diff --git a/src/Scraper/Connections/MyPurdueConnection.cs b/src/Scraper/Connections/MyPurdueConnection.cs index 714850f..ef424e5 100644 --- a/src/Scraper/Connections/MyPurdueConnection.cs +++ b/src/Scraper/Connections/MyPurdueConnection.cs @@ -16,46 +16,41 @@ public class MyPurdueConnection : IMyPurdueConnection // Enum value of implemented HTTP request methods private enum HttpMethod { GET, POST }; - // Username used to authenticate with MyPurdue - 
private readonly string username; - - // Password used to authenticate with MyPurdue - private readonly string password; - // Logger reference private readonly ILogger logger; // Keeps track of cookies for all requests on this connection - private CookieContainer cookies = new CookieContainer(); + private readonly CookieContainer cookies = new(); // Keeps track of the last page requested, used in 'Referrer' HTTP header private string referrer = ""; // HttpClient used by this connection to communicate with MyPurdue - private HttpClient httpClient; + private readonly HttpClient httpClient; // How many request attempts should be made before failure private const int MAX_RETRIES = 5; - // Used to parse session timeout messages - private static Regex sessionTimeoutRegex = new Regex(@"Session timeout occurred", - RegexOptions.IgnoreCase); - - // Attempts to open a new authenticated connection to MyPurdue, - // throws if authentication fails - public static async Task CreateAndConnectAsync(string username, - string password, ILogger logger) + public MyPurdueConnection(ILogger logger) { - var connection = new MyPurdueConnection(username, password, logger); - if (await connection.Authenticate()) - { - return connection; - } - else + this.logger = logger; + + var httpHandler = new HttpClientHandler() { - throw new ApplicationException( - "Could not authenticate to MyPurdue with supplied username and password."); - } + CookieContainer = cookies, // MyPurdue stores a lot of state in cookies - we need + // to persist them to avoid upsetting it + AllowAutoRedirect = false, // We'll handle redirects by ourselves + }; + httpClient = new HttpClient(httpHandler as HttpMessageHandler); + + // Pretend we're Chrome + httpClient.DefaultRequestHeaders.Add("Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); + httpClient.DefaultRequestHeaders.Add("Accept-Language", "en-US,en;q=0.5"); + httpClient.DefaultRequestHeaders.Add("Connection", "keep-alive"); + 
httpClient.DefaultRequestHeaders.Add("User-Agent", + "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " + + "Chrome/45.0.2454.99 Safari/537.36"); } public async Task GetTermListPageAsync() @@ -64,19 +59,16 @@ public async Task GetTermListPageAsync() { var result = await Request(HttpMethod.GET, "https://selfservice.mypurdue.purdue.edu/prod/bwckschd.p_disp_dyn_sched"); - var content = await result.Content.ReadAsStringAsync(); - - if (sessionTimeoutRegex.IsMatch(content)) + if (!result.IsSuccessStatusCode) { - // If we received a session timeout message, authenticate and then try again - await Authenticate(); + this.logger.LogError("Received non-success status code '{}' on " + + "GetTermListPageAsync.", result.StatusCode); } else { - return content; + return await result.Content.ReadAsStringAsync(); } } - throw new ApplicationException( "Exceeded retries attempting to query MyPurdue term list"); } @@ -95,15 +87,14 @@ public async Task GetSubjectListPageAsync(string termCode) }), true, "https://selfservice.mypurdue.purdue.edu/prod/bwckschd.p_disp_dyn_sched"); - var content = await request.Content.ReadAsStringAsync(); - - if (sessionTimeoutRegex.IsMatch(content)) + if (!request.IsSuccessStatusCode) { - // If we received a session timeout message, authenticate and then try again - await Authenticate(); + this.logger.LogError("Received non-success status code '{}' on " + + "GetSubjectListPageAsync.", request.StatusCode); } else { + var content = await request.Content.ReadAsStringAsync(); return content; } } @@ -150,16 +141,15 @@ public async Task GetSectionListPageAsync(string termCode, string subjec var request = await Request(HttpMethod.POST, "https://selfservice.mypurdue.purdue.edu/prod/bwckschd.p_get_crse_unsec", postBody); - var content = await request.Content.ReadAsStringAsync(); - if (sessionTimeoutRegex.IsMatch(content)) + if (!request.IsSuccessStatusCode) { - // If we received a session timeout message, authenticate and then try again 
- await Authenticate(); + this.logger.LogError("Received non-success status code '{}' on " + + "GetSectionListPageAsync.", request.StatusCode); } else { - return content; + return await request.Content.ReadAsStringAsync(); } } throw new ApplicationException( @@ -212,103 +202,33 @@ public async Task GetSectionDetailsPageAsync(string termCode, string sub postBody, true, "https://selfservice.mypurdue.purdue.edu/prod/bwskfcls.P_GetCrse"); - var content = await request.Content.ReadAsStringAsync(); - - if (sessionTimeoutRegex.IsMatch(content)) + if (!request.IsSuccessStatusCode) { - // If we received a session timeout message, authenticate and then try again - await Authenticate(); + this.logger.LogError("Received non-success status code '{}' on " + + "GetSectionDetailsPageAsync.", request.StatusCode); } else { - return content; + return await request.Content.ReadAsStringAsync(); } } throw new ApplicationException( "Exceeded retries attempting to query MyPurdue section details"); } - private MyPurdueConnection(string username, string password, - ILogger logger) - { - this.username = username; - this.password = password; - this.logger = logger; - - var httpHandler = new HttpClientHandler() - { - CookieContainer = cookies, // MyPurdue stores a lot of state in cookies - we need - // to persist them to avoid upsetting it - AllowAutoRedirect = false, // We'll handle redirects by ourselves - }; - httpClient = new HttpClient(httpHandler as HttpMessageHandler); - - // Pretend we're Chrome - httpClient.DefaultRequestHeaders.Add("Accept", - "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); - httpClient.DefaultRequestHeaders.Add("Accept-Language", "en-US,en;q=0.5"); - httpClient.DefaultRequestHeaders.Add("Connection", "keep-alive"); - httpClient.DefaultRequestHeaders.Add("User-Agent", - "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " + - "Chrome/45.0.2454.99 Safari/537.36"); - } - - private async Task Authenticate() - { - var loginForm
= await Request(HttpMethod.GET, - "https://www.purdue.edu/apps/account/cas/login" + - "?service=https%3A%2F%2Fwl.mypurdue.purdue.edu%2Fc%2Fportal%2Flogin"); - HtmlDocument document = new HtmlDocument(); - document.LoadHtml(await loginForm.Content.ReadAsStringAsync()); - HtmlNode docRoot = document.DocumentNode; - var ltValue = docRoot - .SelectSingleNode("//input[@name='lt']") - .GetAttributeValue("value", ""); - - FormUrlEncodedContent content = new FormUrlEncodedContent(new[] - { - new KeyValuePair("username", username), - new KeyValuePair("password", password), - new KeyValuePair("lt", ltValue), - new KeyValuePair("execution", "e1s1"), - new KeyValuePair("_eventId", "submit"), - new KeyValuePair("submit", "Login") - }); - - HttpResponseMessage r = await Request(HttpMethod.POST, - "https://www.purdue.edu/apps/account/cas/login" + - "?service=https%3A%2F%2Fwl.mypurdue.purdue.edu%2Fc%2Fportal%2Flogin", content); - string result = await r.Content.ReadAsStringAsync(); - bool isAuthenticated = !result.Contains("Authentication failed."); - - if (!isAuthenticated) - { - return false; - } - - // Authenticate with MyPurdue self-service - var ssResult = await Request(HttpMethod.GET, - "https://wl.mypurdue.purdue.edu/static_resources/portal/jsp/ss_redir_lp5.jsp?pg=23", - null, true, "https://wl.mypurdue.purdue.edu/"); - var ssResultContent = await ssResult.Content.ReadAsStringAsync(); - // TODO: Verify self-service login - - return isAuthenticated; - } - // Generates a basic request, providing POST body, following redirects, and storing cookies. 
private async Task Request(HttpMethod method, string url, FormUrlEncodedContent postContent = null, bool followRedirect = true, string requestReferrer = null) { - logger.LogDebug($"{method.ToString()} Request: {url}"); + logger.LogDebug("{} Request: {}", method.ToString(), url); if (requestReferrer != null) { referrer = requestReferrer; } - logger.LogDebug($"Referrer: {referrer}"); + logger.LogDebug("Referrer: {}", referrer); if (referrer.Length > 0) { httpClient.DefaultRequestHeaders.Referrer = new Uri(referrer); @@ -326,13 +246,13 @@ private async Task Request(HttpMethod method, string url, { var postString = await postContent.ReadAsStringAsync(); postString = postString.Replace("&", "\n\t\t"); - logger.LogDebug($"POST data: \n{postString}"); + logger.LogDebug("POST data: \n{}", postString); } // Print out all the cookies we're sending var cookiesToSend = cookies.GetCookies(new Uri(url)); - logger.LogDebug("Outgoing cookies: \n" + - $"{string.Join("\n", cookiesToSend.Select(c => c.ToString()))}"); + logger.LogDebug("Outgoing cookies: \n{}", + string.Join("\n", cookiesToSend.Select(c => c.ToString()))); HttpResponseMessage result = null; for (int attempts = 0; attempts < MAX_RETRIES; ++attempts) @@ -351,8 +271,8 @@ private async Task Request(HttpMethod method, string url, } catch (Exception e) { - logger.LogWarning("HTTP request exception, retrying " + - $"({attempts + 1} / {MAX_RETRIES})\n{e.ToString()}"); + logger.LogWarning("HTTP request exception, retrying ({} / {})\n{}", + (attempts + 1), MAX_RETRIES, e.ToString()); continue; } break; @@ -363,19 +283,20 @@ private async Task Request(HttpMethod method, string url, "No request was made - most likely due to invalid HTTP method."); } - IEnumerable incomingCookies; - if (result.Headers.TryGetValues("set-cookie", out incomingCookies)) + if (result.Headers.TryGetValues("set-cookie", out IEnumerable incomingCookies)) { - logger.LogDebug($"Incoming cookies:\n{string.Join("\n", incomingCookies)}"); + 
logger.LogDebug("Incoming cookies:\n{}", string.Join("\n", incomingCookies)); foreach (var c in incomingCookies) { if (c.StartsWith("SESSID") && !c.Contains("expires")) { var sessidCookieValue = c.Substring((c.IndexOf('=') + 1), (c.IndexOf(';') - c.IndexOf('=') - 1)); - var sessidCookie = new Cookie("SESSID", sessidCookieValue); - sessidCookie.Domain = new Uri(url).Host; - sessidCookie.Path = "/"; + var sessidCookie = new Cookie("SESSID", sessidCookieValue) + { + Domain = new Uri(url).Host, + Path = "/" + }; cookies.Add(sessidCookie); } cookies.SetCookies(new Uri(url), c); @@ -384,7 +305,7 @@ private async Task Request(HttpMethod method, string url, if (followRedirect && (result.Headers.Location != null)) { - logger.LogDebug($"Redirect to {result.Headers.Location.ToString()}"); + logger.LogDebug("Redirect to {}", result.Headers.Location.ToString()); // All redirects are converted to GET result = await Request(HttpMethod.GET, result.Headers.Location.ToString(), null); } diff --git a/src/Scraper/MyPurdueScraper.cs b/src/Scraper/MyPurdueScraper.cs index c2feef5..1f07801 100644 --- a/src/Scraper/MyPurdueScraper.cs +++ b/src/Scraper/MyPurdueScraper.cs @@ -63,7 +63,7 @@ public async Task> GetSubjectsAsync(string termCode) { string subjectListPageContent = await connection.GetSubjectListPageAsync(termCode); var subjects = new List(); - HtmlDocument document = new HtmlDocument(); + HtmlDocument document = new(); document.LoadHtml(subjectListPageContent); HtmlNode root = document.DocumentNode; HtmlNodeCollection termSelectNodes = @@ -72,7 +72,7 @@ public async Task> GetSubjectsAsync(string termCode) { var code = HtmlEntity.DeEntitize(node.Attributes["VALUE"].Value).Trim(); var name = HtmlEntity.DeEntitize(node.InnerText).Trim(); - name = name.Substring(name.IndexOf("-")+1); + name = name.Substring(name.IndexOf("-") + 1); subjects.Add(new Subject() { Code = code, @@ -85,75 +85,87 @@ public async Task> GetSubjectsAsync(string termCode) public async Task> 
GetSectionsAsync(string termCode, string subjectCode) { - // The section information we need from MyPurdue is split between two pages: - // the "section list" page (bwckschd.p_get_crse_unsec) and the "section details" page - // (bwskfcls.P_GetCrse_Advanced). + // The section information we need from MyPurdue may be split across + // multiple pages. // - // This method scrapes both the section list page and the section details page, - // then merges the relevant information from both into one coherent model. + // This method scrapes the relevant information from all sources into + // one coherent model. Dictionary sectionList = await FetchSectionListAsync(termCode, subjectCode); - Dictionary sectionDetails = - await FetchSectionDetailsAsync(termCode, subjectCode); - var mergedSections = new List
(); foreach (var sectionPair in sectionList) { Crn sectionCrn = sectionPair.Key; SectionListInfo sectionListInfo = sectionPair.Value; - if (!sectionDetails.ContainsKey(sectionCrn)) - { - throw new ApplicationException($"Section list retrieved from MyPurdue " + - $"contained CRN {sectionCrn} that was not found on section details page"); - } - SectionDetailsInfo sectionDetailsInfo = sectionDetails[sectionCrn]; // Merge meeting info var mergedSectionMeetings = new List(); for (int i = 0; i < sectionListInfo.Meetings.Count; ++i) { var sectionListInfoMeeting = sectionListInfo.Meetings[i]; - var sectionDetailsInfoMeeting = sectionDetailsInfo.Meetings[i]; + if (!BuildingNamesToShortCodes.TryGetValue(sectionListInfoMeeting.BuildingName, + out string buildingShortCode)) + { + throw new ApplicationException( + "No building short code found for building " + + $"'{sectionListInfoMeeting.BuildingName}'"); + } mergedSectionMeetings.Add(new Meeting() { - Type = sectionDetailsInfoMeeting.Type, + Type = sectionListInfoMeeting.Type, Instructors = sectionListInfoMeeting.Instructors, StartDate = sectionListInfoMeeting.StartDate, EndDate = sectionListInfoMeeting.EndDate, DaysOfWeek = sectionListInfoMeeting.DaysOfWeek, StartTime = sectionListInfoMeeting.StartTime, EndTime = sectionListInfoMeeting.EndTime, - BuildingCode = sectionDetailsInfoMeeting.BuildingCode, + BuildingCode = buildingShortCode, BuildingName = sectionListInfoMeeting.BuildingName, RoomNumber = sectionListInfoMeeting.RoomNumber, }); } + string sectionType = sectionListInfo.Meetings.Select(m => m.Type) + .FirstOrDefault(""); + + if (!CampusNamesToShortCodes.TryGetValue(sectionListInfo.CampusName, + out string campusShortCode)) + { + throw new ApplicationException( + "No campus short code found for campus " + + $"'{sectionListInfo.CampusName}'"); + } + // Merge section info mergedSections.Add(new Section() { Crn = sectionCrn, - SectionCode = sectionDetailsInfo.SectionCode, + SectionCode = sectionListInfo.SectionCode, 
Meetings = mergedSectionMeetings.ToArray(), SubjectCode = sectionListInfo.SubjectCode, CourseNumber = sectionListInfo.CourseNumber, - Type = sectionDetailsInfo.Type, + Type = sectionType, CourseTitle = sectionListInfo.CourseTitle, Description = sectionListInfo.Description, CreditHours = sectionListInfo.CreditHours, LinkSelf = sectionListInfo.LinkSelf, LinkOther = sectionListInfo.LinkOther, - CampusCode = sectionDetailsInfo.CampusCode, + CampusCode = campusShortCode, CampusName = sectionListInfo.CampusName, - Capacity = sectionDetailsInfo.Capacity, - Enrolled = sectionDetailsInfo.Enrolled, - RemainingSpace = sectionDetailsInfo.RemainingSpace, - WaitListCapacity = sectionDetailsInfo.WaitListCapacity, - WaitListCount = sectionDetailsInfo.WaitListCount, - WaitListSpace = sectionDetailsInfo.WaitListSpace, + + // The loss of authenticated APIs removed our source of information for + // capacity, enrolled, remaining space, and waitlists without querying + // each CRN individually. + // Tracked here: https://github.com/Purdue-io/PurdueApi/issues/56 + Capacity = 0, + Enrolled = 0, + RemainingSpace = 0, + WaitListCapacity = 0, + WaitListCount = 0, + WaitListSpace = 0, }); } @@ -178,7 +190,7 @@ private async Task> FetchSectionListAsync(strin } // Parse HTML from the returned page - HtmlDocument htmlDocument = new HtmlDocument(); + HtmlDocument htmlDocument = new(); htmlDocument.LoadHtml(sectionListPageContent); HtmlNode docRoot = htmlDocument.DocumentNode; @@ -208,6 +220,7 @@ private async Task> FetchSectionListAsync(strin string parsedCrn = titleParse.Groups["crn"].Value; string parsedSubjectCode = titleParse.Groups["subj"].Value; string parsedCourseNumber = titleParse.Groups["number"].Value; + string parsedSectionCode = titleParse.Groups["section"].Value; string parsedLinkSelf = titleParse.Groups["selflink"].Value; string parsedLinkOther = titleParse.Groups["otherlink"].Value; @@ -216,21 +229,29 @@ private async Task> FetchSectionListAsync(strin double parsedCreditHours = 
0; HtmlNode info = termSelectNodes[i + 1].SelectSingleNode("td"); // TODO: Deal with white space... - string parsedDescription = HtmlEntity.DeEntitize(info.FirstChild.InnerText).Trim(); + string parsedDescription = ""; + if (info.FirstChild.NodeType == HtmlNodeType.Text) + { + parsedDescription = HtmlEntity.DeEntitize(info.FirstChild.InnerText).Trim(); + } HtmlNode additionalInfo = info .SelectSingleNode("span[@class='fieldlabeltext'][last()]") ?.NextSibling?.NextSibling; while (additionalInfo != null) { - if (additionalInfo.InnerText.Contains("Campus")) + if (additionalInfo.NodeType == HtmlNodeType.Text) { - parsedCampusName = HtmlEntity.DeEntitize(additionalInfo.InnerText.Trim()); - } - if (additionalInfo.InnerText.Contains("Credits")) - { - parsedCreditHours = double.Parse( - HtmlEntity.DeEntitize(additionalInfo.InnerText.Trim()).Split( - new string[] { " " }, StringSplitOptions.None)[0]); + if (additionalInfo.InnerText.Contains("Campus")) + { + parsedCampusName = HtmlEntity.DeEntitize( + additionalInfo.InnerText.Trim()); + } + if (additionalInfo.InnerText.Contains("Credits")) + { + parsedCreditHours = double.Parse( + HtmlEntity.DeEntitize(additionalInfo.InnerText.Trim()).Split( + new string[] { " " }, StringSplitOptions.None)[0]); + } } additionalInfo = additionalInfo.NextSibling; } @@ -259,6 +280,8 @@ private async Task> FetchSectionListAsync(strin ParsingUtilities.ParseDaysOfWeek(daysOfWeek); // Parse building / room + // Usually the room number is the last token in the string. + // However, there are a lot of edge cases string parsedMeetingRoomNumber = ""; string parsedMeetingBuildingName = ""; string parsedMeetingBuildingCode = ""; @@ -270,12 +293,45 @@ private async Task> FetchSectionListAsync(strin parsedMeetingBuildingName = "TBA"; parsedMeetingBuildingCode = "TBA"; } + // Handle weird on-site location names (ex. 
"On-site SF 1900W") + else if (room.StartsWith("On-site") && (room.Length > 7)) + { + parsedMeetingBuildingName = room.Substring(0, 7); + parsedMeetingRoomNumber = room.Substring(8); + } + // Some buildings (Studebaker Building in South Bend) have a weird + // room naming convention (ex. "Studebaker Building CLASS 2" or + // "Studebaker Building LAB 4") + else if (room.Contains(" CLASS") || room.Contains(" LAB") || + room.Contains(" Room") || room.Contains(" AU HARTUNG")) + { + var splitIndex = 0; + if (room.Contains(" CLASS")) + { + splitIndex = room.LastIndexOf(" CLASS"); + } + else if (room.Contains(" LAB")) + { + splitIndex = room.LastIndexOf(" LAB"); + } + else if (room.Contains(" Room")) + { + splitIndex = room.LastIndexOf(" Room"); + } + else if (room.Contains(" AU HARTUNG")) + { + splitIndex = room.LastIndexOf(" AU HARTUNG"); + } + parsedMeetingBuildingName = room.Substring(0, splitIndex); + parsedMeetingRoomNumber = room.Substring(splitIndex + 1); + } + // Otherwise we just assume the last space separates the building name + // from the room number. ex. 
"Lawson Computer Science Bldg B131" else { var index = room.LastIndexOf(" "); parsedMeetingBuildingName = room.Substring(0, index); - parsedMeetingRoomNumber = - room.Substring(index + 1,room.Length - index - 1); + parsedMeetingRoomNumber = room.Substring(index + 1); } // Parse dates @@ -331,6 +387,7 @@ private async Task> FetchSectionListAsync(strin SubjectCode = parsedSubjectCode, CourseNumber = parsedCourseNumber, CourseTitle = parsedTitle, + SectionCode = parsedSectionCode, Description = parsedDescription, CreditHours = parsedCreditHours, LinkSelf = parsedLinkSelf, @@ -344,162 +401,6 @@ private async Task> FetchSectionListAsync(strin return parsedSections; } - private async Task> FetchSectionDetailsAsync( - string termCode, string subjectCode) - { - var parsedSections = new Dictionary(); - - string sectionDetailsPageContent = - await connection.GetSectionDetailsPageAsync(termCode, subjectCode); - - // Check if we didn't return any classes - // TODO: Might be a significant perf hit - we can probably avoid searching for this - // string if the page is large enough that there are likely results. - if (sectionDetailsPageContent.Contains( - "No classes were found that meet your search criteria")) - { - return parsedSections; - } - - // Parse HTML from the returned page - HtmlDocument htmlDocument = new HtmlDocument(); - htmlDocument.LoadHtml(sectionDetailsPageContent); - HtmlNode docRoot = htmlDocument.DocumentNode; - HtmlNodeCollection sectionNodes = docRoot.SelectNodes( - "/html/body/div[@class='pagebodydiv'][1]//" + - "table[@class='datadisplaytable'][1]/tr[ not ( th ) ]"); - if (sectionNodes == null) - { - throw new ApplicationException("Could not parse data from section details page."); - } - - // Loop through table rows - SectionDetailsInfo section = null; - for (var i = 0; i < sectionNodes.Count; i++) - { - var node = sectionNodes[i]; - var crnNode = node.SelectSingleNode("td[2]"); - if (crnNode == null) - { - continue; // No node? Skip... 
- } - - // Each row is a section AND/OR meeting. - // If there's a CRN in this row, it means that we're looking at a new section. - if (HtmlEntity.DeEntitize(crnNode.InnerText).Trim().Length > 0) - { - // Section w/ primary meeting data - var crnNumber = HtmlEntity.DeEntitize(crnNode.InnerText).Trim(); - // Deal with credit hours... - var credits = HtmlEntity.DeEntitize(node.SelectSingleNode("td[7]").InnerText) - .Trim(); - if (credits.Contains("-")) { - credits = credits.Substring(credits.IndexOf("-")+1); - } - else if (credits.Contains("/")) - { - credits = credits.Substring(credits.IndexOf("/") + 1); - } - section = new SectionDetailsInfo() - { - Crn = crnNumber, - SectionCode = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[5]").InnerText).Trim(), - Meetings = new List(), - SubjectCode = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[3]").InnerText).Trim(), - CourseNumber = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[4]").InnerText).Trim(), - Type = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[23]").InnerText).Trim(), - CourseTitle = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[8]").InnerText).Trim(), - Description = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[26]").InnerText).Trim(), - CreditHours = double.Parse(credits), - CampusCode = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[6]").InnerText).Trim(), - Capacity = Int32.Parse(HtmlEntity.DeEntitize( - node.SelectSingleNode("td[11]").InnerText).Trim()), - Enrolled = Int32.Parse(HtmlEntity.DeEntitize( - node.SelectSingleNode("td[12]").InnerText).Trim()), - RemainingSpace = Int32.Parse(HtmlEntity.DeEntitize( - node.SelectSingleNode("td[13]").InnerText).Trim()), - WaitListCapacity = Int32.Parse(HtmlEntity.DeEntitize( - node.SelectSingleNode("td[14]").InnerText).Trim()), - WaitListCount = Int32.Parse(HtmlEntity.DeEntitize( - node.SelectSingleNode("td[15]").InnerText).Trim()), - WaitListSpace = Int32.Parse(HtmlEntity.DeEntitize( - 
node.SelectSingleNode("td[16]").InnerText).Trim()), - }; - - parsedSections.Add(crnNumber, section); - } - - // Now, update meeting data for this row - // Update meeting days of the week - // Parse days of week - var daysOfWeek = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[9]").InnerText).Trim(); - DaysOfWeek parsedMeetingDaysOfWeek = ParsingUtilities.ParseDaysOfWeek(daysOfWeek); - - // Parse times - var times = HtmlEntity.DeEntitize(node.SelectSingleNode("td[10]").InnerText).Trim(); - // TODO: Don't hard-code time zone - var startEndTimes = ParsingUtilities.ParseStartEndTime(times, - defaultTimeZone); - DateTimeOffset parsedMeetingStartTime = startEndTimes.Item1; - DateTimeOffset parsedMeetingEndTime = startEndTimes.Item2; - - // Update meeting location (building short name) - var loc = HtmlEntity.DeEntitize(node.SelectSingleNode("td[22]").InnerText).Trim(); - string parsedMeetingBuildingCode = ""; - string parsedMeetingBuildingName = ""; - string parsedMeetingRoomNumber = ""; - if (loc.Equals("TBA")) - { - parsedMeetingBuildingCode = "TBA"; - parsedMeetingBuildingName = "TBA"; - parsedMeetingRoomNumber = "TBA"; - } - else if (loc.Length > 0) - { - if (loc.Contains(" ")) - { - parsedMeetingBuildingCode = loc.Substring(0, loc.IndexOf(" ")).Trim(); - parsedMeetingRoomNumber = loc.Substring(loc.IndexOf(" ") + 1).Trim(); - } - else - { - parsedMeetingBuildingCode = loc; - parsedMeetingRoomNumber = ""; - } - } else - { - throw new ApplicationException( - $"Could not parse location data for section CRN {section.Crn}."); - } - - // Updating meeting type - string parsedMeetingType = HtmlEntity.DeEntitize( - node.SelectSingleNode("td[23]").InnerText).Trim(); - - // Add the meeting - section.Meetings.Add(new SectionDetailsMeetingInfo() - { - Type = parsedMeetingType, - DaysOfWeek = parsedMeetingDaysOfWeek, - StartTime = parsedMeetingStartTime, - EndTime = parsedMeetingEndTime, - BuildingCode = parsedMeetingBuildingCode, - BuildingName = parsedMeetingBuildingName, - 
RoomNumber = parsedMeetingRoomNumber, - }); - } - - return parsedSections; - } - // A subset of Section information that can be scraped from the section list page. private record SectionListInfo { @@ -513,6 +414,8 @@ private record SectionListInfo public string CourseTitle { get; init; } + public string SectionCode { get; init; } + public string Description { get; init; } public double CreditHours { get; init; } @@ -524,58 +427,194 @@ private record SectionListInfo public string CampusName { get; init; } } - // A subset of Section information that can be scraped from the section details page. - private record SectionDetailsInfo + // The loss of authenticated APIs removed our source of information for + // campus short codes, so now they are hard-coded here and we just hope + // new campuses are a fairly rare occurrence. + // Tracked here: https://github.com/Purdue-io/PurdueApi/issues/55 + private static readonly Dictionary CampusNamesToShortCodes = new() { - public Crn Crn { get; init; } - - public string SectionCode { get; init; } - - public IList Meetings { get; init; } - - public string SubjectCode { get; init; } - - public string CourseNumber { get; init; } - - public string Type { get; init; } - - public string CourseTitle { get; init; } - - public string Description { get; init; } - - public double CreditHours { get; init; } - - public string CampusCode { get; init; } - - public int Capacity { get; init; } - - public int Enrolled { get; init; } - - public int RemainingSpace { get; init; } - - public int WaitListCapacity { get; init; } - - public int WaitListCount { get; init; } - - public int WaitListSpace { get; init; } - } - - // A subset of Meeting information that can be scraped from the section details page - private record SectionDetailsMeetingInfo + { "West Lafayette Campus", "PWL" }, + { "West Lafayette Continuing Ed Campus", "CEC" }, + { "IUPUI Campus", "PIU" }, + { "New Albany Campus", "TNA" }, + { "Richmond Campus", "TRI" }, + { "Lafayette Campus", 
"TLF" }, + { "Anderson Campus", "TAN" }, + { "South Bend Campus", "TSB" }, + { "Columbus Campus", "TCO" }, + { "Indianapolis Campus", "TDY" }, + { "Kokomo Campus", "TKO" }, + { "Vincennes Campus", "TVN" }, + { "Greensburg Campus", "TGB" }, + { "Concurrent Credit Campus", "CC" }, + { "Dual Campus Campus", "TDC" }, + }; + + // The loss of authenticated APIs removed our source of information for + // building short codes, so now they are hard-coded here and we just hope + // new buildings are a fairly rare occurrence. + // Tracked here: https://github.com/Purdue-io/PurdueApi/issues/54 + private static readonly Dictionary BuildingNamesToShortCodes = new() { - public string Type { get; init; } - - public DaysOfWeek DaysOfWeek { get; init; } - - public DateTimeOffset StartTime { get; init; } - - public DateTimeOffset EndTime { get; init; } - - public string BuildingCode { get; init; } - - public string BuildingName { get; init; } - - public string RoomNumber { get; init; } - } + { "Asynchronous Online Learning", "ASYNC" }, + { "TBA", "TBA" }, + { "Chaffee Hall", "CHAF" }, + { "Seng-Liang Wang Hall", "WANG" }, + { "Lawson Computer Science Bldg", "LWSN" }, + { "Synchronous Online Learning", "SYNC" }, + { "Forney Hall of Chemical Engr", "FRNY" }, + { "Grissom Hall", "GRIS" }, + { "Knoy Hall of Technology", "KNOY" }, + { "Mathematical Sciences Building", "MATH" }, + { "Electrical Engineering Bldg", "EE" }, + { "Stewart Center", "STEW" }, + { "Materials and Electrical Engr", "MSEE" }, + { "Wilmeth Active Learning Center", "WALC" }, + { "Stanley Coulter Hall", "SC" }, + { "Honors College&Resid North", "HCRN" }, + { "Physics Building", "PHYS" }, + { "Brown Laboratory of Chemistry", "BRWN" }, + { "Wetherill Lab of Chemistry", "WTHR" }, + { "Hampton Hall of Civil Engnrng", "HAMP" }, + { "Neil Armstrong Hall of Engr", "ARMS" }, + { "Recitation Building", "REC" }, + { "Beering Hall of Lib Arts & Ed", "BRNG" }, + { "Lilly Hall of Life Sciences", "LILY" }, + { "ADM Agricultural 
Innovation Ct", "ADM" }, + { "Lyles-Porter Hall", "LYLE" }, + { "Agricultural & Biological Engr", "ABE" }, + { "Hicks Undergraduate Library", "HIKS" }, + { "Forest Products Building", "FPRD" }, + { "Pao Hall of Visual & Perf Arts", "PAO" }, + { "Nelson Hall of Food Science", "NLSN" }, + { "Forestry Building", "FORS" }, + { "Armory", "AR" }, + { "Biochemistry Building", "BCHM" }, + { "Class of 1950 Lecture Hall", "CL50" }, + { "Smith Hall", "SMTH" }, + { "Jerry S Rawls Hall", "RAWL" }, + { "Krannert Building", "KRAN" }, + { "Horticulture Building", "HORT" }, + { "On-site", "OFFCMP" }, + { "Daniel Turfgrass Rsch&Diag Ct", "DANL" }, + { "Heavilon Hall", "HEAV" }, + { "University Hall", "UNIV" }, + { "Land O Lakes Ctr", "LOLC" }, + { "Matthews Hall", "MTHW" }, + { "Creighton Hall of Animal Sci", "CRTN" }, + { "Jischke Hall of Biomedical Eng", "MJIS" }, + { "Peirce Hall", "PRCE" }, + { "Winthrop E. Stone Hall", "STON" }, + { "Holleman-Niswonger Simultr Ctr", "SIML" }, + { "Niswonger Aviation Tech Bldg", "NISW" }, + { "Terminal Building (Hangar 2)", "TERM" }, + { "Indiana Manufcturing Institute", "IMI" }, + { "Aerospace Science Lab-Hangar 3", "AERO" }, + { "Composites Laboratory", "COMP" }, + { "Slayter Ctr of Performing Arts", "SCPA" }, + { "Elliott Hall of Music", "ELLT" }, + { "Chaney-Hale Hall of Science", "CHAS" }, + { "Morgan Ctr for Entrepreneurshp", "MRGN" }, + { "Lynn Hall of Vet Medicine", "LYNN" }, + { "University Church", "UC" }, + { "Robert Heine Pharmacy Building", "RHPH" }, + { "Mechanical Engineering Bldg", "ME" }, + { "Vet Pathobiology Research Bldg", "VPRB" }, + { "Veterinary Pathology Building", "VPTH" }, + { "Psychological Sciences Bldg", "PSYC" }, + { "Felix Haas Hall", "HAAS" }, + { "Marriott Hall", "MRRT" }, + { "Potter Engineering Center", "POTR" }, + { "Lambert Field House & Gym", "LAMB" }, + { "Brees Student-Athlete Acad Ctr", "BRES" }, + { "Krach Leadership Center", "KRCH" }, + { "Ernest C. 
Young Hall", "YONG" }, + { "Eleanor B Shreve Residence Hal", "SHRV" }, + { "John S. Wright Forestry Center", "WRIT" }, + { "Pfendler Hall of Agriculture", "PFEN" }, + { "Honors College&Resid South", "HCRS" }, + { "Griffin Residence Hall South", "GRFS" }, + { "Fowler Memorial House", "FWLR" }, + { "Bill and Sally Hanley Hall", "HNLY" }, + { "Horticultural Greenhouse", "HGRH" }, + { "State Farm", "SF" }, + { "Johnson Hall of Nursing", "JNSN" }, + { "Schwartz Tennis Center", "SCHW" }, + { "Purdue Memorial Union", "PMU" }, + { "Hillenbrand Residence Hall", "HILL" }, + { "Equine Health Sciences Annex", "EHSA" }, + { "Equine Health Sciences Bldg", "EHSB" }, + { "Online", "ONLINE" }, + { "Hockmeyer Hall Strc Bio", "HOCK" }, + { "The Innovation Center", "INVC" }, + { "Whistler Hall of Ag Research", "WSLR" }, + { "Training & Reception Center ROOM", "TRC" }, + { "Purdue Technology Center", "SEI" }, + { "Technology Statewide Site", "TECHSW" }, + { "Agricultural Administration", "AGAD" }, + { "Drug Discovery Bldg", "DRUG" }, + { "Bechtel Innovation Design Ctr", "BIDC" }, + { "Michael Golden Labs and Shops", "MGL" }, + { "Tom Spurgeon Golf Training Ctr", "SPUR" }, + { "Ground Service Building", "GRS" }, + { "Third Street Suites", "TSS" }, + { "Nuclear Engineering Building", "NUCL" }, + { "Kepner Hall", "KPNR" }, + { "Purdue Memorial Union Club", "PMUC" }, + { "Cordova Rec Sports Center", "CREC" }, + { "Herrick Laboratories", "HLAB" }, + { "Animal Sciences Teaching Lab", "ASTL" }, + { "Cancelled", "CANCEL" }, + { "Ross-Ade Stadium", "STDM" }, + { "Discovery and Learning", "DLR" }, + { "On-site ONLINE", "OFFCMP" }, + { "Black Cultural Center", "BCC" }, + { "Subaru Isuzu Automotive", "SIA" }, + { "Hansen Life Sciences Research", "HANS" }, + { "Animal Disease Diagnostic Lab", "ADDL" }, + { "Guy J Mackey Arena", "MACK" }, + { "Ray W Herrick Laboratory", "HERL" }, + { "Harvey W. 
Wiley Residence Hall", "WILY" }, + { "Boilermaker Aquatic Center", "AQUA" }, + { "INOK Investments Warehouse", "INOK" }, + { "High Pressure Research Lab", "ZL3" }, + { "Engineering Administration", "ENAD" }, + { "Bindley Bioscience Center", "BIND" }, + { "Krannert Center", "KCTR" }, + { "South Campus Courts Bldg B", "SCCB" }, + { "Civil Engineering Building", "CIVL" }, + { "Recreational Sports Center", "RSC" }, + { "Service Building", "SERV" }, + { "Child Developmt&Family Studies", "CDFS" }, + { "Food Science Building", "FS" }, + { "Birk Nanotechnology Center", "BRK" }, + { "Max W & Maileen Brown Hall", "BHEE" }, + { "Marc & Sharon Hagle Hall", "HAGL" }, + { "Inventrek Technology Park", "INVTRK" }, + { "Advanced Manufacturing Center", "AMCE" }, + { "Learning Center", "LC" }, + { "Purdue Polytechnic Anderson", "PPA" }, + { "Flex Lab", "FLEX" }, + { "Main Street Resource Center", "MSRC" }, + { "Studebaker Building", "SBST" }, + { "Asian Amer & Asian Cult Ctr", "AACC" }, + { "Dudley Hall", "DUDL" }, + { "Gerald D and Edna E Mann Hall", "MANN" }, + { "Lambertus Hall", "LMBS" }, + { "Homeland Security & Public Ser", "HSPS" }, + { "McDaniel Hall", "MHALL" }, + { "Johnson Hall", "JHALL" }, + { "Meredith Residence Hall South", "MRDS" }, + { "OffSite", "OFST" }, + { "Alexandria Room", "ALEX" }, + { "Helen B. Schleman Hall", "SCHM" }, + { "Winifred Parker Residence Hall", "PKRW" }, + { "Technology Building", "TECHB" }, + { "Aviation Technology Center", "ATC" }, + { "Homeland Security Building", "HSB" }, + { "Muncie Central", "MCHS" }, // I made this short code up since this is a dual-credit + // course at Muncie Central High School, presumably + // doesn't have a real short code. 
+ }; } } \ No newline at end of file diff --git a/src/Tests/ParsingTests.cs b/src/Tests/ParsingTests.cs index 9c4db0a..576be9e 100644 --- a/src/Tests/ParsingTests.cs +++ b/src/Tests/ParsingTests.cs @@ -42,13 +42,21 @@ public async Task SectionParsing() Assert.Equal("", spotCheck.LinkOther); Assert.Equal("PWL", spotCheck.CampusCode); Assert.Equal("West Lafayette Campus", spotCheck.CampusName); - Assert.Equal(24, spotCheck.Capacity); - Assert.Equal(5, spotCheck.Enrolled); - Assert.Equal(19, spotCheck.RemainingSpace); + + // The loss of authenticated APIs removed our source of information for + // capacity, enrolled, remaining space, and waitlists without querying + // each CRN individually. + // So now these are all zero. + // Tracked here: https://github.com/Purdue-io/PurdueApi/issues/56 + Assert.Equal(0, spotCheck.Capacity); + Assert.Equal(0, spotCheck.Enrolled); + Assert.Equal(0, spotCheck.RemainingSpace); Assert.Equal(0, spotCheck.WaitListCapacity); Assert.Equal(0, spotCheck.WaitListCount); Assert.Equal(0, spotCheck.WaitListSpace); + Assert.Equal(2, spotCheck.Meetings.Length); + // First meeting Meeting spotCheckMeeting = spotCheck.Meetings[0]; Assert.Equal("Lecture", spotCheckMeeting.Type);