{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":793176658,"defaultBranch":"main","name":"llm-adversarial-attacks","ownerLogin":"lena-lenkeit","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2024-04-28T16:37:18.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/92172030?v=4","public":true,"private":false,"isOrgOwned":false},"refInfo":{"name":"","listCacheKey":"v0:1714322240.0","currentOid":""},"activityList":{"items":[{"before":"acba309d2d9acdbefde999b910379c5dfada385f","after":"bcc0b844bae9d09c195f5df4eac143b62567bd57","ref":"refs/heads/main","pushedAt":"2024-05-25T15:56:31.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added utility to convert adversarial tokens into a dataset","shortMessageHtmlLink":"Added utility to convert adversarial tokens into a dataset"}},{"before":"a51de5a7e2c0bdacc5d623deff8fbf5aead9507d","after":"acba309d2d9acdbefde999b910379c5dfada385f","ref":"refs/heads/main","pushedAt":"2024-05-23T13:03:35.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Include cmd args in output file","shortMessageHtmlLink":"Include cmd args in output file"}},{"before":"675ed952a7b0979340f720189d76a9f8456e24ad","after":"a51de5a7e2c0bdacc5d623deff8fbf5aead9507d","ref":"refs/heads/main","pushedAt":"2024-05-23T11:06:57.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Flag to disable validation model reloading","shortMessageHtmlLink":"Flag to disable validation model reloading"}},{"before":"ba39d3871e062e348d6c686c9464f0067dd316ed","after":"675ed952a7b0979340f720189d76a9f8456e24ad","ref":"refs/heads/main","pushedAt":"2024-05-23T10:48:37.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added model cache dir to adv. token script","shortMessageHtmlLink":"Added model cache dir to adv. token script"}},{"before":"753988d5462ff21d3aa24cb527dc788ee13ca41e","after":"ba39d3871e062e348d6c686c9464f0067dd316ed","ref":"refs/heads/main","pushedAt":"2024-05-22T17:01:56.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Fixed patched generation dtype issue","shortMessageHtmlLink":"Fixed patched generation dtype issue"}},{"before":"816ee675089708b92e012e99cd636f662578fcdb","after":"753988d5462ff21d3aa24cb527dc788ee13ca41e","ref":"refs/heads/main","pushedAt":"2024-05-22T16:27:03.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added more patching modes\n\n- Modes are now flags\n- New base mode: set_project (set value along the patch axis)\n- Extensions of set_project: set_logit and set_z, to directly set logit / z-score.\n- Flag to normalize probe weight","shortMessageHtmlLink":"Added more patching modes"}},{"before":"48a2f212cf99e7716739eb306b58d839725c82ca","after":"816ee675089708b92e012e99cd636f662578fcdb","ref":"refs/heads/main","pushedAt":"2024-05-21T23:38:07.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added script to generate continuations from a probe patched model","shortMessageHtmlLink":"Added script to generate continuations from a probe patched model"}},{"before":"7ff8ba207795bc98ed393f7f9787cd562a2a7e15","after":"48a2f212cf99e7716739eb306b58d839725c82ca","ref":"refs/heads/main","pushedAt":"2024-05-21T12:01:47.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Slightly faster dataset preparation during probe training and evaluation","shortMessageHtmlLink":"Slightly faster dataset preparation during probe training and evaluation"}},{"before":"b5485a6abd531f444267d576d7041b213038bd32","after":"7ff8ba207795bc98ed393f7f9787cd562a2a7e15","ref":"refs/heads/main","pushedAt":"2024-05-20T21:37:37.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Switched to directly saving all best probes according to each split","shortMessageHtmlLink":"Switched to directly saving all best probes according to each split"}},{"before":"09c48714e029058ad0f34a0993bfbc1ae2658857","after":"b5485a6abd531f444267d576d7041b213038bd32","ref":"refs/heads/main","pushedAt":"2024-05-20T21:11:34.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Correct round trip checking for batched sampling","shortMessageHtmlLink":"Correct round trip checking for batched sampling"}},{"before":"efedd7137c8dd35b624541992047750d424c64b4","after":"09c48714e029058ad0f34a0993bfbc1ae2658857","ref":"refs/heads/main","pushedAt":"2024-05-20T21:10:49.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Allow switching between selection metrics for best probe","shortMessageHtmlLink":"Allow switching between selection metrics for best probe"}},{"before":"112cbf400d95d3363f80c6cd4b418c623b150440","after":"efedd7137c8dd35b624541992047750d424c64b4","ref":"refs/heads/main","pushedAt":"2024-05-20T16:01:51.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Correctly handle batched updating of best loss and values","shortMessageHtmlLink":"Correctly handle batched updating of best loss and values"}},{"before":"b9af79ce35b841d24b199544e747c2730123339c","after":"112cbf400d95d3363f80c6cd4b418c623b150440","ref":"refs/heads/main","pushedAt":"2024-05-20T14:57:37.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Update script todo","shortMessageHtmlLink":"Update script todo"}},{"before":"1d0c8e7796c065856298cc8a3d26a1db5b56daa1","after":"b9af79ce35b841d24b199544e747c2730123339c","ref":"refs/heads/main","pushedAt":"2024-05-20T12:58:33.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Set hard softmax as default method","shortMessageHtmlLink":"Set hard softmax as default method"}},{"before":"2f7a953047590047482cf9fdd6666f2b61565714","after":"1d0c8e7796c065856298cc8a3d26a1db5b56daa1","ref":"refs/heads/main","pushedAt":"2024-05-17T23:40:30.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Refactored max. act. script\n\n- Moved some code into functions to reduce repetition\n- Correctly calculating token boundaries and probe position\n- Made validation section much cleaner","shortMessageHtmlLink":"Refactored max. act. script"}},{"before":"863a37c47e713b350ff4fe165308e7d0166211bd","after":"2f7a953047590047482cf9fdd6666f2b61565714","ref":"refs/heads/main","pushedAt":"2024-05-17T15:42:41.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Optimized activation sampling cache creation and merging","shortMessageHtmlLink":"Optimized activation sampling cache creation and merging"}},{"before":"e7c8993d4ebefc1beaf359356740ca4d6b429677","after":"863a37c47e713b350ff4fe165308e7d0166211bd","ref":"refs/heads/main","pushedAt":"2024-05-17T14:38:16.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Allow sampling activations for entire dataset","shortMessageHtmlLink":"Allow sampling activations for entire dataset"}},{"before":"f89aa8cda76c54f6f97ee0bfab183aee9b6846a7","after":"e7c8993d4ebefc1beaf359356740ca4d6b429677","ref":"refs/heads/main","pushedAt":"2024-05-16T21:40:55.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added saving of found adversarial tokens and various metrics","shortMessageHtmlLink":"Added saving of found adversarial tokens and various metrics"}},{"before":"09dbb5c7050e3227bd79710162b32c7c54838d56","after":"f89aa8cda76c54f6f97ee0bfab183aee9b6846a7","ref":"refs/heads/main","pushedAt":"2024-05-16T17:55:23.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Rename activation sampling script","shortMessageHtmlLink":"Rename activation sampling script"}},{"before":"d5aa4a21f23d1411598a2e0eeda4cafa1821a2c4","after":"09dbb5c7050e3227bd79710162b32c7c54838d56","ref":"refs/heads/main","pushedAt":"2024-05-16T17:53:56.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Faster activation sampling by sorting across token counts","shortMessageHtmlLink":"Faster activation sampling by sorting across token counts"}},{"before":"bfa3f9d0857fee6af0550369192d7fa0ab5700bd","after":"d5aa4a21f23d1411598a2e0eeda4cafa1821a2c4","ref":"refs/heads/main","pushedAt":"2024-05-16T17:08:41.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added conversion scripts for multiple new datasets\n\n- BeaverTails\n- Financial Phrasebank\n- IMDB\n- Rotten Tomatoes\n- Yelp Review","shortMessageHtmlLink":"Added conversion scripts for multiple new datasets"}},{"before":"f26dbc60d8e5c564d7e26ca8d5e22009463c40ff","after":"bfa3f9d0857fee6af0550369192d7fa0ab5700bd","ref":"refs/heads/main","pushedAt":"2024-05-16T15:03:25.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Changed probe storage format\n\n- Split evals from data\n- Two files, one for probes, one for metrics and evals","shortMessageHtmlLink":"Changed probe storage format"}},{"before":"e8840a5e65e98fe07533255416edce5a7a2e3bea","after":"f26dbc60d8e5c564d7e26ca8d5e22009463c40ff","ref":"refs/heads/main","pushedAt":"2024-05-15T21:43:15.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Allow training probes on all layers and searching for best probe","shortMessageHtmlLink":"Allow training probes on all layers and searching for best probe"}},{"before":"de5abd8a3e7c1fe99ce4a9c0d65bd74193dd5607","after":"e8840a5e65e98fe07533255416edce5a7a2e3bea","ref":"refs/heads/main","pushedAt":"2024-05-14T22:19:54.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Hard kernel in kernel plot util","shortMessageHtmlLink":"Hard kernel in kernel plot util"}},{"before":"b943b742b931b11b5deba05d9d5a279129bb4cd5","after":"de5abd8a3e7c1fe99ce4a9c0d65bd74193dd5607","ref":"refs/heads/main","pushedAt":"2024-05-14T20:52:43.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Infer probe layer id when not provided","shortMessageHtmlLink":"Infer probe layer id when not provided"}},{"before":"f019d4cbf2f1d9d2839da340ef7c1567eac21c23","after":"b943b742b931b11b5deba05d9d5a279129bb4cd5","ref":"refs/heads/main","pushedAt":"2024-05-14T19:41:09.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Overhaul of max token script to be completely adjustable via cmd arguments","shortMessageHtmlLink":"Overhaul of max token script to be completely adjustable via cmd argu…"}},{"before":"59a73d3c9c8b60e7fe8ae6bc307a4d72ce0cd880","after":"f019d4cbf2f1d9d2839da340ef7c1567eac21c23","ref":"refs/heads/main","pushedAt":"2024-05-14T11:59:36.000Z","pushType":"push","commitsCount":4,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added and tested various different token optimization methods to max act. script (Heavy WIP)\n\nIncludes all of the following (commiting now because I'm reworking the script to be much cleaner and more user-friendly, but I don't want to accidentally remove functionality).\n\n- Includes various attempts based on learning token mixing matrices instead of embeddings (softmax, gumbel softmax, entropy penalization, max. prob. maximization)\n- Distance regularization kernels\n- Parameter schedules\n- Realism loss\n- Per-token embedding line search instead of SGD\n- Projected gradient descent instead of regularization","shortMessageHtmlLink":"Added and tested various different token optimization methods to max …"}},{"before":"cc6974dc35293e98c2c6488cd7e46550ec0f5ea8","after":"59a73d3c9c8b60e7fe8ae6bc307a4d72ce0cd880","ref":"refs/heads/main","pushedAt":"2024-05-04T23:48:04.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added a plotting script for finding good regularization kernels in 2D","shortMessageHtmlLink":"Added a plotting script for finding good regularization kernels in 2D"}},{"before":"192e4d36138a4db7aa8bb259613ae6abf8907b44","after":"cc6974dc35293e98c2c6488cd7e46550ec0f5ea8","ref":"refs/heads/main","pushedAt":"2024-05-04T14:26:27.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":".gitignore datasets folder","shortMessageHtmlLink":".gitignore datasets folder"}},{"before":"dfbd36c6f4000c3477923b1f1fad29b24c618362","after":"192e4d36138a4db7aa8bb259613ae6abf8907b44","ref":"refs/heads/main","pushedAt":"2024-05-04T14:06:42.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"lena-lenkeit","name":"Lena Maxine Lenkeit","path":"/lena-lenkeit","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/92172030?s=80&v=4"},"commit":{"message":"Added missing target_text\n\nWas missing from previous commits","shortMessageHtmlLink":"Added missing target_text"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"Y3Vyc29yOnYyOpK7MjAyNC0wNS0yNVQxNTo1NjozMS4wMDAwMDBazwAAAARTxxEw","startCursor":"Y3Vyc29yOnYyOpK7MjAyNC0wNS0yNVQxNTo1NjozMS4wMDAwMDBazwAAAARTxxEw","endCursor":"Y3Vyc29yOnYyOpK7MjAyNC0wNS0wNFQxNDowNjo0Mi4wMDAwMDBazwAAAARBchMm"}},"title":"Activity · lena-lenkeit/llm-adversarial-attacks"}