<!DOCTYPE html>
<html>
<head>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-C1CRWDNJ1J"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-C1CRWDNJ1J');
</script>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"><title>HF. 17 papers. December 20.</title>
<link rel="icon" href="favicon.svg" sizes="any" type="image/svg+xml">
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;700&display=swap" rel="stylesheet">
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@100..900&family=Tiny5&display=swap" rel="stylesheet">
<style>
:root {
--primary-color: cornflowerblue;
--primary-color-dark: #fffd87cf;
--secondary-color: #fff;
--background-color: #eee;
--text-color: #333333;
--header-color: cornflowerblue;
--body-color: #eee;
--menu-color: #002370;
}
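/* The custom properties above are consumed throughout the stylesheet via var(--...);
   --primary-color-dark is the link color applied under .dark-theme. */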
.background-digit {
position: absolute;
font-family: 'Tiny5';
bottom: -20px;
right: -10px;
font-size: 8em;
font-weight: 400;
color: #0989ea22;
z-index: 2;
line-height: 1;
}
.dark-theme .background-digit {
color: #e9e78f3d;
}
body {
font-family: 'Roboto Slab', sans-serif;
line-height: 1.6;
color: var(--text-color);
margin: 0;
padding: 0;
min-height: 100vh;
display: flex;
flex-direction: column;
}
.container {
max-width: 1500px;
margin: 0 auto;
flex: 1 0 auto;
width: 100%
}
.a-clean {
color: var(--secondary-color);
text-decoration: none;
}
.a-clean:hover {
color: #fff;
}
header {
padding: 3.6em 0 2.4em 0;
text-align: center;
}
footer {
background-color: var(--primary-color);
color: white;
text-align: center;
margin-top: 2em;
flex-shrink: 0;
padding: 20px;
}
h1 {
font-size: 2.4em;
margin: 0;
font-weight: 700;
}
.article-title-cont {
margin: -21px -21px 0px -21px;
padding: 10px 20px;
background: cornflowerblue;
display: table;
min-height: 5.9em;
}
.dark-theme .article-title-cont {
background: #444444;
}
.article-title {
color: white;
}
.article-title h2 {
margin: 0px;
padding: 0px;
font-weight: 400;
text-align:center;
}
h2 {
/* color: var(--primary-color); */
font-size: 1.2em;
margin-top: 0;
margin-bottom: 0.5em;
}
header p {
font-size: 1.2em;
margin-top: 0.5em;
font-weight: 300;
}
main {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 1.5em;
padding: 10px 20px 20px 20px;
}
body.dark-theme>header {
background-color: #333333;
color: white;
}
body.dark-theme>div>main>article>div.article-content>p.meta {
color: #fff;
}
body.light-theme>div>main>article>div.article-content>p.meta {
color: #555;
}
body.dark-theme>div>main>article>div.article-content>p.pub-date {
color: #ccc;
}
body.light-theme>div>main>article>div.article-content>p.pub-date {
color: #555;
}
body.dark-theme>div>main>article>div.article-content>div.tags {
color: #ccc;
}
body.light-theme>div>main>article>div.article-content>div.tags {
color: #fff;
}
body.light-theme>header {
background-color: var(--header-color);
color: white;
}
article {
display: flex;
flex-direction: row;
justify-content: center;
}
.article-content {
border-radius: 5px;
border: 1px solid #ddd;
overflow: hidden;
transition: background-color 0.2s ease;
padding: 1.3em;
flex-grow: 1;
display: flex;
flex-direction: column;
position: relative;
z-index: 1;
cursor: pointer;
max-width: 800px;
}
body.dark-theme>div>main>article>div.article-content {
background-color: #444;
border: none;
}
body.light-theme>div>main>article>div.article-content {
background-color: #fff;
}
body.dark-theme>div>main>article>div.article-content:hover {
background-color: #414141;
}
body.light-theme>div>main>article>div.article-content:hover {
background-color: #fafafa;
}
.meta {
font-size: 0.9em;
margin-bottom: 0em;
font-weight: 500;
margin: 20px 0 0px 0;
padding-bottom: 20px;
border-bottom: 1px solid #ddd;
}
.pub-date {
font-size: 0.8em;
margin-bottom: 0.8em;
font-weight: 400;
text-align: right;
font-family: Roboto;
}
.tags {
font-size: 0.9em;
margin-bottom: 0;
position: absolute;
bottom: 0px;
font-weight: 300;
font-family: 'Roboto Slab';
background: #555;
left: 0;
width: 100%;
padding: 10px 20px;
}
.abstract {
position: relative;
max-height: 170px;
overflow: hidden;
transition: max-height 0.3s ease;
cursor: pointer;
}
.abstract.expanded {
max-height: 1000px;
}
.abstract-toggle {
position: absolute;
bottom: 4px;
right: 0;
cursor: pointer;
color: var(--primary-color);
float: right;
font-weight: 400;
}
.explanation {
background-color: #e8f5e9;
border-left: 4px solid var(--secondary-color);
padding: 1em;
margin-top: 1.5em;
}
.links {
margin-top: 1.5em;
margin-bottom: 20px;
}
.affiliations {
margin-bottom: 50px;
padding:10px;
font-size: 0.9em;
text-align: center
}
a {
color: var(--primary-color);
text-decoration: none;
font-weight: 500;
transition: color 0.3s ease;
}
.dark-theme a {
color: var(--primary-color-dark);
}
a:hover {
color: #e73838;
}
.light-theme {
background-color: var(--body-color);
color: #333333;
}
.dark-theme {
background-color: #333333;
color: #ffffff;
}
.theme-switch {
position: absolute;
top: 20px;
right: 20px;
display: flex;
align-items: center;
}
.switch {
position: relative;
display: inline-block;
width: 50px;
height: 30px;
}
.switch input {
opacity: 0;
width: 0;
height: 0;
}
.slider {
position: absolute;
cursor: pointer;
top: 0;
left: 0;
right: 0;
bottom: 0;
background-color: #ccc;
transition: .4s;
border-radius: 30px;
}
.slider:before {
position: absolute;
content: "";
height: 24px;
width: 24px;
left: 3px;
bottom: 3px;
background-color: white;
transition: .4s;
border-radius: 50%;
}
input:checked + .slider {
background-color: var(--primary-color);
}
input:checked + .slider:before {
transform: translateX(20px);
}
.switch-label {
margin-right: 10px;
}
.sub-header-container {
display: flex;
justify-content: space-between;
align-items: center;
flex-wrap: wrap;
gap: 15px;
margin-top: 7px;
padding: 0 20px;
}
.sub-header-container-2 {
display: flex;
justify-content: left;
align-items: center;
flex-wrap: wrap;
gap: 15px;
margin: 0 auto;
padding: 0 20px;
}
.update-info-container {
margin-top: 15px;
margin-bottom: 0px;
text-align: left;
flex: 1;
}
.sort-container {
margin-top: 15px;
margin-bottom: 0px;
text-align: right;
flex: 2;
}
.category-toggle-container {
display: inline-block;
margin-top: 15px;
margin-bottom: 10px;
cursor: pointer;
}
.category-option-container {
margin-top: 15px;
margin-bottom: 10px;
display: none;
margin-left: auto;
}
.category-option-container.expanded {
display: block;
}
.sort-dropdown {
padding: 5px 10px;
font-size: 16px;
border-radius: 5px;
border: 1px solid #ccc;
background-color: white;
color: var(--text-color);
font-family: 'Roboto Slab', sans-serif;
}
.sort-label {
margin-right: 10px;
font-size: 1.0em !important;
}
.dark-theme .sort-dropdown {
background-color: #444;
color: white;
border-color: var(--text-color);
}
.title-sign {
display: inline-block;
transition: all 0.5s ease;
}
.rotate {
transform: rotate(45deg) translateY(-6px);
transform-origin: center;
}
.title-text {
display: inline;
padding-left: 10px;
}
.summary_title {
font-size: 1.2em;
font-weight: bold;
color: #222;
margin-bottom: 5px;
}
.summary_text {
}
.summary_image {
max-height: 500px;
max-width: 100%;
display: block;
margin-left: auto;
margin-right: auto;
margin-top: 40px;
margin-bottom: 60px;
}
.category-filters {
margin-top: 20px;
margin-bottom: 20px;
text-align: center;
display: none;
}
.category-filters.expanded {
display: block;
margin-top: 10px;
}
.category-button {
display: inline-block;
margin: 5px;
padding: 5px 10px;
border-radius: 15px;
background-color: #f0f0f0;
color: #333;
cursor: pointer;
transition: background-color 0.3s ease;
}
.category-button.active {
background-color: var(--primary-color);
color: white;
}
.category-button.inactive:not(.active) {
color: #ccc;
}
.dark-theme .category-button {
background-color: #555;
color: #fff;
}
.dark-theme .category-button.active {
background-color: var(--primary-color);
}
.dark-theme .category-button.inactive:not(.active) {
color: #888;
}
.clear-categories {
display: inline-block;
margin: 5px;
padding: 5px 10px;
border-radius: 15px;
background-color: #f0f0f0;
color: #333;
cursor: pointer;
transition: background-color 0.3s ease;
}
.clear-categories:hover {
background-color: #bbb;
}
.svg-container {
display: inline-block;
position: relative;
overflow: hidden;
}
.svg-container span {
position: relative;
z-index: 1;
}
.svg-container svg {
position: absolute;
bottom: 0;
left: 0;
z-index: 0;
}
.nav-menu {
background-color: var(--menu-color);
padding: 2px 0 2px 0;
display: inline-block;
position: relative;
overflow: hidden;
width: 100%;
}
.nav-container {
max-width: 1500px;
margin: 0 auto;
display: flex;
justify-content: left;
gap: 3em;
}
.nav-container span a {
color: white;
}
.nav-item {
color: white;
padding: 3px 0px;
cursor: pointer;
font-weight: 400;
}
.nav-prev {
margin-left: 20px;
}
.nav-item:hover {
background-color: rgba(255, 255, 255, 0.1);
border-color: rgba(255, 255, 255, 0.3);
}
.language-flags {
display: flex;
gap: 7px;
padding: 5px 20px 0 0;
margin-left: auto;
}
.flag-svg {
width: 22px;
height: 22px;
cursor: pointer;
opacity: 0.4;
transition: opacity 0.3s ease;
border-radius: 2px;
}
.flag-svg.active {
opacity: 1;
}
.flag-svg:hover {
opacity: 0.8;
}
.dark-theme .nav-menu {
background-color: #333;
}
.dark-theme .nav-item {
color: white;
}
.dark-theme .nav-item:hover {
background-color: rgba(255, 255, 255, 0.05);
}
.pointer { cursor: pointer; }
.article-pdf-title-img {
max-width: 100%;
max-height: 400px;
display: inline-block;
margin-top: 10px;
margin-bottom: 10px;
border-radius: 5px;
}
.article-pdf-title-img-cont {
text-align: center;
}
.dark-theme .article-pdf-title-img {
opacity: 0.8;
filter: grayscale(1);
}
@media (max-width: 600px) {
.nav-container {
flex-direction: row;
gap: 1.5em;
}
.nav-item {
padding: 3px 0px;
}
}
@media (max-width: 768px) {
.category-filters {
display: none;
}
.category-toggle {
display: inline-block;
width: 100%;
text-align: left;
}
.category-filters.expanded {
display: block;
margin-top: 10px;
}
}
@media (max-width: 600px) {
.sub-header-container {
flex-direction: column;
align-items: flex-start;
}
.sort-container {
width: 100%;
display: flex;
justify-content: left;
margin: 0 auto;
}
.sort-dropdown {
margin-left: auto;
}
.sort-label {
margin-top: 5px;
float: left;
}
.sub-header-container-2 {
flex-direction: row;
align-items: flex-start;
}
.update-info-container {
text-align: left;
width: 100%;
margin-bottom: 0px;
}
.category-toggle-container {
margin-top: 15px;
text-align: left;
margin-bottom: 10px;
}
.category-option-container {
margin-top: 15px;
text-align: center;
margin-bottom: 10px;
}
main {
grid-template-columns: 1fr;
gap: 0em;
padding: 10px 0 20px 0;
}
footer {
margin-top: -20px;
}
article>div.article-content {
border-radius: 0px;
}
}
</style>
<script>
function toggleAbstract(id) {
var abstract = document.getElementById('abstract-' + id);
var toggle = document.getElementById('toggle-' + id);
if (abstract.classList.contains('expanded')) {
abstract.classList.remove('expanded');
toggle.textContent = '...';
} else {
abstract.classList.add('expanded');
toggle.textContent = '';
}
}
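// Illustrative usage (the id value is hypothetical): toggleAbstract('abc123') expands or
// collapses the element with id "abstract-abc123" and shows/hides the "..." marker in the
// element with id "toggle-abc123".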
function getTimeDiff(dateString, lang='ru') {
const timeUnits = {
ru: {
minute: ["минуту", "минуты", "минут"],
hour: ["час", "часа", "часов"],
day: ["день", "дня", "дней"],
justNow: "только что",
ago: "назад"
},
en: {
minute: ["minute", "minutes", "minutes"],
hour: ["hour", "hours", "hours"],
day: ["day", "days", "days"],
justNow: "just now",
ago: "ago"
},
zh: {
minute: ["分钟", "分钟", "分钟"],
hour: ["小时", "小时", "小时"],
day: ["天", "天", "天"],
justNow: "刚刚",
ago: "前"
}
};
function getPlural(number, words, lang) {
if (lang === 'ru') {
if (number % 10 === 1 && number % 100 !== 11) {
return words[0];
} else if (number % 10 >= 2 && number % 10 <= 4 && (number % 100 < 10 || number % 100 >= 20)) {
return words[1];
} else {
return words[2];
}
} else if (lang === 'en') {
return number === 1 ? words[0] : words[1];
} else {
// Chinese doesn't need plural forms
return words[0];
}
}
function formatTimeDiff(number, unit, lang) {
const unitWord = getPlural(number, timeUnits[lang][unit], lang);
if (lang === 'zh') {
return `${number}${unitWord}${timeUnits[lang].ago}`;
} else {
return `${number} ${unitWord} ${timeUnits[lang].ago}`;
}
}
if (!['ru', 'en', 'zh'].includes(lang)) {
throw new Error('Unsupported language. Supported languages are: ru, en, zh');
}
const pastDate = new Date(dateString.replace(" ", "T") + ":00Z");
const currentDate = new Date();
const diffInSeconds = Math.floor((currentDate - pastDate) / 1000);
const minutes = Math.floor(diffInSeconds / 60);
const hours = Math.floor(diffInSeconds / 3600);
const days = Math.floor(diffInSeconds / 86400);
if (minutes === 0) {
return timeUnits[lang].justNow;
} else if (minutes < 60) {
return formatTimeDiff(minutes, 'minute', lang);
} else if (hours < 24) {
return formatTimeDiff(hours, 'hour', lang);
} else {
return formatTimeDiff(days, 'day', lang);
}
}
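// Illustrative examples (assuming the current time is 2024-12-20T12:30Z):
//   getTimeDiff('2024-12-20 10:00', 'en') -> "2 hours ago"
//   getTimeDiff('2024-12-20 10:00', 'ru') -> "2 часа назад"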
function isToday(dateString) {
const inputDate = new Date(dateString);
const today = new Date();
return (
inputDate.getFullYear() === today.getFullYear() &&
inputDate.getMonth() === today.getMonth() &&
inputDate.getDate() === today.getDate()
);
}
function isCurrentMonth(dateString) {
const inputDate = new Date(dateString);
const today = new Date();
return (
inputDate.getFullYear() === today.getFullYear() &&
inputDate.getMonth() === today.getMonth()
);
}
function formatArticlesTitle(number, lang='ru') {
const lastDigit = number % 10;
const lastTwoDigits = number % 100;
let word;
if (!['ru', 'en', 'zh'].includes(lang)) {
throw new Error('Unsupported language. Supported languages are: ru, en, zh');
}
if (lang === 'ru') {
if (lastTwoDigits >= 11 && lastTwoDigits <= 14) {
word = "статей";
} else if (lastDigit === 1) {
word = "статья";
} else if (lastDigit >= 2 && lastDigit <= 4) {
word = "статьи";
} else {
word = "статей";
}
} else if (lang === 'en') {
if (number === 1) {
word = 'paper'
} else {
word = 'papers'
}
} else if (lang === 'zh') {
word = "篇论文"
}
if (lang === 'zh') {
return `${number}${word}`;
} else {
return `${number} ${word}`;
}
}
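// Illustrative examples:
//   formatArticlesTitle(17, 'en') -> "17 papers"
//   formatArticlesTitle(21, 'ru') -> "21 статья" (Russian plural rules)
//   formatArticlesTitle(17, 'zh') -> "17篇论文" (no space before the counter word)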
</script>
</head>
<body class="light-theme">
<header>
<div class="container">
<a href="https://hfday.ru" class="a-clean"><h1 class="title-sign" id="doomgrad-icon">🔺</h1><h1 class="title-text" id="doomgrad">hf daily</h1></a>
<p><span id="title-date">20 декабря</span> | <span id="title-articles-count">17 papers</span></p>
</div>
<div class="theme-switch">
<label class="switch">
<input type="checkbox" id="theme-toggle">
<span class="slider"></span>
</label>
</div>
</header>
<div class="nav-menu">
<div class="nav-container">
<span class="nav-item nav-prev" id="nav-prev"><a href="/d/2024-12-19.html">⬅️ <span id="prev-date">19.12</span></a></span>
<span class="nav-item" id="nav-next"><a href="/d/2024-12-23.html">➡️ <span id="next-date">23.12</span></a></span>
<span class="nav-item" id="nav-monthly"><a href="/m/2024-12.html">📈 <span id='top-month-label'>Месяц</span></a></span>
<div class="language-flags">
<svg class="flag-svg" data-lang="ru" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><path fill="#1435a1" d="M1 11H31V21H1z"></path><path d="M5,4H27c2.208,0,4,1.792,4,4v4H1v-4c0-2.208,1.792-4,4-4Z" fill="#fff"></path><path d="M5,20H27c2.208,0,4,1.792,4,4v4H1v-4c0-2.208,1.792-4,4-4Z" transform="rotate(180 16 24)" fill="#c53a28"></path><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path></svg>
<svg class="flag-svg" data-lang="zh" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><rect x="1" y="4" width="30" height="24" rx="4" ry="4" fill="#db362f"></rect><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path fill="#ff0" d="M7.958 10.152L7.19 7.786 6.421 10.152 3.934 10.152 5.946 11.614 5.177 13.979 7.19 12.517 9.202 13.979 8.433 11.614 10.446 10.152 7.958 10.152z"></path><path fill="#ff0" d="M12.725 8.187L13.152 8.898 13.224 8.072 14.032 7.886 13.269 7.562 13.342 6.736 12.798 7.361 12.035 7.037 12.461 7.748 11.917 8.373 12.725 8.187z"></path><path fill="#ff0" d="M14.865 10.372L14.982 11.193 15.37 10.46 16.187 10.602 15.61 10.007 15.997 9.274 15.253 9.639 14.675 9.044 14.793 9.865 14.048 10.23 14.865 10.372z"></path><path fill="#ff0" d="M15.597 13.612L16.25 13.101 15.421 13.13 15.137 12.352 14.909 13.149 14.081 13.179 14.769 13.642 14.541 14.439 15.194 13.928 15.881 14.391 15.597 13.612z"></path><path fill="#ff0" d="M13.26 15.535L13.298 14.707 12.78 15.354 12.005 15.062 12.46 15.754 11.942 16.402 12.742 16.182 13.198 16.875 13.236 16.047 14.036 15.827 13.26 15.535z"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path></svg>
<svg class="flag-svg" data-lang="en" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><rect x="1" y="4" width="30" height="24" rx="4" ry="4" fill="#fff"></rect><path d="M1.638,5.846H30.362c-.711-1.108-1.947-1.846-3.362-1.846H5c-1.414,0-2.65,.738-3.362,1.846Z" fill="#a62842"></path><path d="M2.03,7.692c-.008,.103-.03,.202-.03,.308v1.539H31v-1.539c0-.105-.022-.204-.03-.308H2.03Z" fill="#a62842"></path><path fill="#a62842" d="M2 11.385H31V13.231H2z"></path><path fill="#a62842" d="M2 15.077H31V16.923000000000002H2z"></path><path fill="#a62842" d="M1 18.769H31V20.615H1z"></path><path d="M1,24c0,.105,.023,.204,.031,.308H30.969c.008-.103,.031-.202,.031-.308v-1.539H1v1.539Z" fill="#a62842"></path><path d="M30.362,26.154H1.638c.711,1.108,1.947,1.846,3.362,1.846H27c1.414,0,2.65-.738,3.362-1.846Z" fill="#a62842"></path><path d="M5,4h11v12.923H1V8c0-2.208,1.792-4,4-4Z" fill="#102d5e"></path><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path><path fill="#fff" d="M4.601 7.463L5.193 7.033 4.462 7.033 4.236 6.338 4.01 7.033 3.279 7.033 3.87 7.463 3.644 8.158 4.236 7.729 4.827 8.158 4.601 7.463z"></path><path fill="#fff" d="M7.58 7.463L8.172 7.033 7.441 7.033 7.215 6.338 6.989 7.033 6.258 7.033 6.849 7.463 6.623 8.158 7.215 7.729 7.806 8.158 7.58 7.463z"></path><path fill="#fff" d="M10.56 7.463L11.151 7.033 10.42 7.033 10.194 6.338 9.968 7.033 9.237 7.033 9.828 7.463 9.603 8.158 10.194 7.729 10.785 8.158 10.56 7.463z"></path><path fill="#fff" d="M6.066 9.283L6.658 8.854 5.927 8.854 5.701 8.158 5.475 8.854 4.744 8.854 5.335 9.283 5.109 9.979 5.701 9.549 6.292 9.979 6.066 9.283z"></path><path fill="#fff" d="M9.046 9.283L9.637 8.854 8.906 8.854 8.68 8.158 8.454 8.854 7.723 8.854 8.314 9.283 8.089 9.979 8.68 9.549 9.271 9.979 9.046 9.283z"></path><path fill="#fff" d="M12.025 9.283L12.616 8.854 11.885 8.854 11.659 8.158 11.433 8.854 10.702 8.854 11.294 9.283 11.068 9.979 11.659 9.549 12.251 9.979 12.025 9.283z"></path><path fill="#fff" d="M6.066 12.924L6.658 12.494 5.927 12.494 5.701 11.799 5.475 12.494 4.744 12.494 5.335 12.924 5.109 13.619 5.701 13.19 6.292 13.619 6.066 12.924z"></path><path fill="#fff" d="M9.046 12.924L9.637 12.494 8.906 12.494 8.68 11.799 8.454 12.494 7.723 12.494 8.314 12.924 8.089 13.619 8.68 13.19 9.271 13.619 9.046 12.924z"></path><path fill="#fff" d="M12.025 12.924L12.616 12.494 11.885 12.494 11.659 11.799 11.433 12.494 10.702 12.494 11.294 12.924 11.068 13.619 11.659 13.19 12.251 13.619 12.025 12.924z"></path><path fill="#fff" d="M13.539 7.463L14.13 7.033 13.399 7.033 13.173 6.338 12.947 7.033 12.216 7.033 12.808 7.463 12.582 8.158 13.173 7.729 13.765 8.158 13.539 7.463z"></path><path fill="#fff" d="M4.601 11.104L5.193 10.674 4.462 10.674 4.236 9.979 4.01 10.674 3.279 10.674 3.87 11.104 3.644 11.799 4.236 11.369 4.827 11.799 4.601 11.104z"></path><path fill="#fff" d="M7.58 11.104L8.172 10.674 7.441 10.674 7.215 9.979 6.989 10.674 6.258 10.674 6.849 11.104 6.623 11.799 7.215 11.369 7.806 11.799 7.58 11.104z"></path><path fill="#fff" d="M10.56 11.104L11.151 10.674 10.42 10.674 10.194 9.979 9.968 10.674 9.237 10.674 9.828 11.104 9.603 11.799 10.194 11.369 10.785 11.799 10.56 11.104z"></path><path fill="#fff" d="M13.539 11.104L14.13 10.674 13.399 
10.674 13.173 9.979 12.947 10.674 12.216 10.674 12.808 11.104 12.582 11.799 13.173 11.369 13.765 11.799 13.539 11.104z"></path><path fill="#fff" d="M4.601 14.744L5.193 14.315 4.462 14.315 4.236 13.619 4.01 14.315 3.279 14.315 3.87 14.744 3.644 15.44 4.236 15.01 4.827 15.44 4.601 14.744z"></path><path fill="#fff" d="M7.58 14.744L8.172 14.315 7.441 14.315 7.215 13.619 6.989 14.315 6.258 14.315 6.849 14.744 6.623 15.44 7.215 15.01 7.806 15.44 7.58 14.744z"></path><path fill="#fff" d="M10.56 14.744L11.151 14.315 10.42 14.315 10.194 13.619 9.968 14.315 9.237 14.315 9.828 14.744 9.603 15.44 10.194 15.01 10.785 15.44 10.56 14.744z"></path><path fill="#fff" d="M13.539 14.744L14.13 14.315 13.399 14.315 13.173 13.619 12.947 14.315 12.216 14.315 12.808 14.744 12.582 15.44 13.173 15.01 13.765 15.44 13.539 14.744z"></path></svg>
</div>
</div>
</div>
<div class="container">
<div class="sub-header-container">
<div class="update-info-container">
<label class="update-info-label" id="timeDiff"></label>
</div>
<div class="sort-container">
<label class="sort-label">🔀 <span id="sort-label-text">Сортировка по</span></label>
<select id="sort-dropdown" class="sort-dropdown">
<option value="default">рейтингу</option>
<option value="pub_date">дате публикации</option>
<option value="issue_id">добавлению на HF</option>
</select>
</div>
</div>
<div class="sub-header-container-2">
<div class="category-toggle-container">
<div class="svg-container">
<span id="category-toggle">🏷️ Фильтр</span>
<svg height="3" width="200">
<line x1="0" y1="0" x2="200" y2="0"
stroke="black"
stroke-width="2"
stroke-dasharray="3, 3" />
</svg>
</div>
</div>
<div class="category-option-container" id="category-options">
<label class="pointer" for="filter-logic-or"><input type="radio" id="filter-logic-or" name="filter-logic" value="or"> A∪B</label>
<label class="pointer" for="filter-logic-and"><input type="radio" id="filter-logic-and" name="filter-logic" value="and"> A∩B</label>
</div>
</div>
<div class="category-filters" id="category-filters">
<span class="clear-categories" id="clear-categories">🧹</span>
<!-- Categories -->
</div>
<main id="articles-container">
<!-- Articles -->
</main>
</div>
<footer>
<div class="container">
<p><a style="color:white;" href="https://t.me/doomgrad">doomgrad</a> ✖️ <a style="color:white;" href="https://huggingface.co/papers">hugging face</a></p>
</div>
</footer>
<script>
// Language handling
let currentLang = localStorage.getItem('selectedLang') || 'en';
let feedDate = {'ru': '20 декабря', 'en': 'December 20', 'zh': '12月20日'};
let feedDateNext = {'ru': '23.12', 'en': '12/23', 'zh': '12月23日'};
let feedDatePrev = {'ru': '19.12', 'en': '12/19', 'zh': '12月19日'};
let filterLabel = {'ru': 'Фильтр', 'en': 'Topics', 'zh': '主题筛选'}
let publishedLabel = {'ru': 'статья от ', 'en': 'published on ', 'zh': '发表于'}
let sortLabel = {'ru': 'Сортировка по', 'en': 'Sort by', 'zh': '排序方式'}
let paperLabel = {'ru': 'Статья', 'en': 'Paper', 'zh': '论文'}
let topMonthLabel = {'ru': 'Месяц', 'en': 'Month', 'zh': '月度论文'}
let topDayLabel = {'ru': 'День', 'en': 'Day', 'zh': '日度论文'}
function initializeLanguageFlags() {
const flags = document.querySelectorAll('.flag-svg');
flags.forEach(flag => {
if (flag.dataset.lang === currentLang) {
flag.classList.add('active');
}
flag.addEventListener('click', () => {
flags.forEach(f => f.classList.remove('active'));
flag.classList.add('active');
currentLang = flag.dataset.lang;
localStorage.setItem('selectedLang', currentLang);
updateTimeDiffs();
updateLocalization();
filterAndRenderArticles();
});
});
}
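// Note: updateTimeDiffs(), updateLocalization() and filterAndRenderArticles() are assumed
// to be defined later in the full index.html; they are not part of this excerpt.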
function toggleTheme() {
const body = document.body;
body.classList.toggle('light-theme');
body.classList.toggle('dark-theme');
const isDarkMode = body.classList.contains('dark-theme');
localStorage.setItem('darkMode', isDarkMode);
if (isDarkMode) {
const title = document.getElementById('doomgrad');
title.innerHTML = "hf nightly";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.add('rotate');
} else {
const title = document.getElementById('doomgrad');
title.innerHTML = "hf daily";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.remove('rotate');
}
}
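// toggleTheme() flips the <body> class between light-theme and dark-theme, persists the
// choice under the localStorage key 'darkMode', and swaps the header wordmark between
// "hf daily" and "hf nightly" (rotating the 🔺 icon in dark mode).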
const articlesData = [{'id': 'https://huggingface.co/papers/2412.15115', 'title': 'Qwen2.5 Technical Report', 'url': 'https://huggingface.co/papers/2412.15115', 'abstract': 'In this report, we introduce Qwen2.5, a comprehensive series of large language models (LLMs) designed to meet diverse needs. Compared to previous iterations, Qwen 2.5 has been significantly improved during both the pre-training and post-training stages. In terms of pre-training, we have scaled the high-quality pre-training datasets from the previous 7 trillion tokens to 18 trillion tokens. This provides a strong foundation for common sense, expert knowledge, and reasoning capabilities. In terms of post-training, we implement intricate supervised finetuning with over 1 million samples, as well as multistage reinforcement learning. Post-training techniques enhance human preference, and notably improve long text generation, structural data analysis, and instruction following. To handle diverse and varied use cases effectively, we present Qwen2.5 LLM series in rich sizes. Open-weight offerings include base and instruction-tuned models, with quantized versions available. In addition, for hosted solutions, the proprietary models currently include two mixture-of-experts (MoE) variants: Qwen2.5-Turbo and Qwen2.5-Plus, both available from Alibaba Cloud Model Studio. Qwen2.5 has demonstrated top-tier performance on a wide range of benchmarks evaluating language understanding, reasoning, mathematics, coding, human preference alignment, etc. Specifically, the open-weight flagship Qwen2.5-72B-Instruct outperforms a number of open and proprietary models and demonstrates competitive performance to the state-of-the-art open-weight model, Llama-3-405B-Instruct, which is around 5 times larger. Qwen2.5-Turbo and Qwen2.5-Plus offer superior cost-effectiveness while performing competitively against GPT-4o-mini and GPT-4o respectively. Additionally, as the foundation, Qwen2.5 models have been instrumental in training specialized models such as Qwen2.5-Math, Qwen2.5-Coder, QwQ, and multimodal models.', 'score': 270, 'issue_id': 1227, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': '578b15d8a263e387', 'authors': ['Qwen', ':', 'An Yang', 'Baosong Yang', 'Beichen Zhang', 'Binyuan Hui', 'Bo Zheng', 'Bowen Yu', 'Chengyuan Li', 'Dayiheng Liu', 'Fei Huang', 'Haoran Wei', 'Huan Lin', 'Jian Yang', 'Jianhong Tu', 'Jianwei Zhang', 'Jianxin Yang', 'Jiaxi Yang', 'Jingren Zhou', 'Junyang Lin', 'Kai Dang', 'Keming Lu', 'Keqin Bao', 'Kexin Yang', 'Le Yu', 'Mei Li', 'Mingfeng Xue', 'Pei Zhang', 'Qin Zhu', 'Rui Men', 'Runji Lin', 'Tianhao Li', 'Tingyu Xia', 'Xingzhang Ren', 'Xuancheng Ren', 'Yang Fan', 'Yang Su', 'Yichang Zhang', 'Yu Wan', 'Yuqiong Liu', 'Zeyu Cui', 'Zhenru Zhang', 'Zihan Qiu'], 'affiliations': ['Alibaba Cloud Model Studio', 'Hugging Face Hub', 'Kaggle', 'ModelScope'], 'pdf_title_img': 'assets/pdf/title_img/2412.15115.jpg', 'data': {'categories': ['#benchmark', '#training', '#reasoning', '#alignment', '#multimodal', '#architecture', '#agi', '#dataset', '#optimization', '#open_source'], 'emoji': '🧠', 'ru': {'title': 'Qwen2.5: Новое поколение языковых моделей с улучшенной эффективностью и разнообразием применений', 'desc': 'Статья представляет серию больших языковых моделей Qwen2.5, разработанных для различных потребностей. Модели прошли значительные улучшения на этапах предобучения и постобучения, включая увеличение объема обучающих данных до 18 триллионов токенов. 
Применены техники тонкой настройки и многоэтапного обучения с подкреплением для улучшения генерации длинных текстов и следования инструкциям. Qwen2.5 демонстрирует высокую производительность в различных задачах, конкурируя с современными моделями, значительно превосходящими ее по размеру.'}, 'en': {'title': 'Qwen2.5: Elevating Language Models with Unmatched Scale and Precision', 'desc': "Qwen2.5 is a new series of large language models (LLMs) that have been enhanced through extensive pre-training and post-training processes. The pre-training phase utilized a massive dataset of 18 trillion tokens, significantly improving the model's common sense and reasoning abilities. In the post-training phase, advanced techniques like supervised finetuning and reinforcement learning were applied to refine the model's performance on tasks such as long text generation and instruction following. Qwen2.5 models are available in various sizes and configurations, demonstrating top performance across multiple benchmarks and applications, including specialized models for math and coding."}, 'zh': {'title': 'Qwen2.5:满足多样化需求的大型语言模型', 'desc': '本文介绍了Qwen2.5,这是一个全面的大型语言模型系列,旨在满足多样化的需求。与之前的版本相比,Qwen2.5在预训练和后训练阶段都有显著改进,预训练数据集从7万亿个标记扩展到18万亿个标记,为常识、专家知识和推理能力提供了坚实基础。后训练方面,采用了超过100万样本的复杂监督微调和多阶段强化学习,显著提升了人类偏好和长文本生成能力。Qwen2.5在语言理解、推理、数学、编码等多个基准测试中表现出色,尤其是其旗舰模型Qwen2.5-72B-Instruct在性能上超越了许多开放和专有模型。'}}}, {'id': 'https://huggingface.co/papers/2412.14835', 'title': 'Progressive Multimodal Reasoning via Active Retrieval', 'url': 'https://huggingface.co/papers/2412.14835', 'abstract': 'Multi-step multimodal reasoning tasks pose significant challenges for multimodal large language models (MLLMs), and finding effective ways to enhance their performance in such scenarios remains an unresolved issue. In this paper, we propose AR-MCTS, a universal framework designed to progressively improve the reasoning capabilities of MLLMs through Active Retrieval (AR) and Monte Carlo Tree Search (MCTS). Our approach begins with the development of a unified retrieval module that retrieves key supporting insights for solving complex reasoning problems from a hybrid-modal retrieval corpus. To bridge the gap in automated multimodal reasoning verification, we employ the MCTS algorithm combined with an active retrieval mechanism, which enables the automatic generation of step-wise annotations. This strategy dynamically retrieves key insights for each reasoning step, moving beyond traditional beam search sampling to improve the diversity and reliability of the reasoning space. Additionally, we introduce a process reward model that aligns progressively to support the automatic verification of multimodal reasoning tasks. Experimental results across three complex multimodal reasoning benchmarks confirm the effectiveness of the AR-MCTS framework in enhancing the performance of various multimodal models. 
Further analysis demonstrates that AR-MCTS can optimize sampling diversity and accuracy, yielding reliable multimodal reasoning.', 'score': 53, 'issue_id': 1227, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': '749dab304f766614', 'authors': ['Guanting Dong', 'Chenghao Zhang', 'Mengjie Deng', 'Yutao Zhu', 'Zhicheng Dou', 'Ji-Rong Wen'], 'affiliations': ['Gaoling School of Artificial Intelligence, Renmin University of China'], 'pdf_title_img': 'assets/pdf/title_img/2412.14835.jpg', 'data': {'categories': ['#benchmark', '#reasoning', '#multimodal', '#architecture', '#optimization'], 'emoji': '🧠', 'ru': {'title': 'AR-MCTS: Новый подход к усилению мультимодальных рассуждений ИИ', 'desc': 'Статья представляет AR-MCTS - универсальную систему для улучшения способностей мультимодальных языковых моделей (MLLM) в решении сложных задач рассуждения. Система использует активное извлечение (AR) и метод Монте-Карло для деревьев поиска (MCTS) для генерации пошаговых аннотаций и динамического извлечения ключевых идей на каждом этапе рассуждения. AR-MCTS также включает модель вознаграждения процесса для автоматической проверки мультимодальных рассуждений. Эксперименты показали эффективность AR-MCTS в повышении производительности различных мультимодальных моделей на трех сложных тестах мультимодальных рассуждений.'}, 'en': {'title': 'Enhancing Multimodal Reasoning with AR-MCTS Framework', 'desc': 'This paper addresses the challenges faced by multimodal large language models (MLLMs) in performing multi-step reasoning tasks. The authors introduce a new framework called AR-MCTS, which combines Active Retrieval (AR) and Monte Carlo Tree Search (MCTS) to enhance the reasoning capabilities of MLLMs. By utilizing a unified retrieval module, the framework retrieves essential insights from a hybrid-modal corpus to assist in solving complex problems. The approach not only improves the diversity and reliability of reasoning but also incorporates a process reward model for automatic verification of multimodal reasoning tasks, demonstrating significant performance improvements in experiments.'}, 'zh': {'title': '提升多模态推理能力的AR-MCTS框架', 'desc': '本文提出了一种名为AR-MCTS的通用框架,旨在通过主动检索和蒙特卡洛树搜索来逐步提升多模态大语言模型(MLLMs)的推理能力。该方法首先开发了一个统一的检索模块,从混合模态检索库中提取解决复杂推理问题的关键支持信息。为了弥补自动化多模态推理验证的不足,我们结合了MCTS算法和主动检索机制,实现了逐步注释的自动生成。实验结果表明,AR-MCTS框架在提升多模态模型性能方面具有显著效果,优化了采样的多样性和准确性。'}}}, {'id': 'https://huggingface.co/papers/2412.14475', 'title': 'MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval', 'url': 'https://huggingface.co/papers/2412.14475', 'abstract': 'Despite the rapidly growing demand for multimodal retrieval, progress in this field remains severely constrained by a lack of training data. In this paper, we introduce MegaPairs, a novel data synthesis method that leverages vision language models (VLMs) and open-domain images, together with a massive synthetic dataset generated from this method. Our empirical analysis shows that MegaPairs generates high-quality data, enabling the multimodal retriever to significantly outperform the baseline model trained on 70times more data from existing datasets. Moreover, since MegaPairs solely relies on general image corpora and open-source VLMs, it can be easily scaled up, enabling continuous improvements in retrieval performance. In this stage, we produced more than 26 million training instances and trained several models of varying sizes using this data. 
These new models achieve state-of-the-art zero-shot performance across 4 popular composed image retrieval (CIR) benchmarks and the highest overall performance on the 36 datasets provided by MMEB. They also demonstrate notable performance improvements with additional downstream fine-tuning. Our produced dataset, well-trained models, and data synthesis pipeline will be made publicly available to facilitate the future development of this field.', 'score': 46, 'issue_id': 1227, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': '9cc225c1e0ce01c5', 'authors': ['Junjie Zhou', 'Zheng Liu', 'Ze Liu', 'Shitao Xiao', 'Yueze Wang', 'Bo Zhao', 'Chen Jason Zhang', 'Defu Lian', 'Yongping Xiong'], 'affiliations': ['Beijing Academy of Artificial Intelligence', 'Beijing University of Posts and Telecommunications', 'Shanghai Jiaotong University', 'The Hong Kong Polytechnic University', 'University of Science and Technology of China'], 'pdf_title_img': 'assets/pdf/title_img/2412.14475.jpg', 'data': {'categories': ['#benchmark', '#training', '#multimodal', '#synthetic', '#dataset', '#open_source', '#data'], 'emoji': '🔍', 'ru': {'title': 'MegaPairs: синтез данных для прорыва в мультимодальном поиске', 'desc': 'Статья представляет MegaPairs - новый метод синтеза данных для мультимодального поиска, использующий модели компьютерного зрения и языка (VLM) и изображения из открытых источников. Исследователи создали массивный синтетический датасет, который позволяет значительно улучшить производительность мультимодальных ретриверов по сравнению с базовыми моделями. Модели, обученные на этих данных, достигают state-of-the-art результатов в zero-shot режиме на нескольких бенчмарках составного поиска изображений (CIR). Авторы планируют открыть доступ к датасету, обученным моделям и пайплайну синтеза данных для дальнейшего развития этой области.'}, 'en': {'title': 'MegaPairs: Unlocking Multimodal Retrieval with Synthetic Data', 'desc': 'This paper presents MegaPairs, a new method for creating training data for multimodal retrieval tasks, which combine images and text. By using vision language models (VLMs) and a large collection of open-domain images, MegaPairs generates a synthetic dataset that significantly enhances the training process. The results show that models trained with MegaPairs outperform those trained on much larger existing datasets, achieving state-of-the-art performance in various benchmarks. The authors plan to make their dataset and models publicly available to support further advancements in multimodal retrieval research.'}, 'zh': {'title': 'MegaPairs:提升多模态检索的新方法', 'desc': '本论文介绍了一种名为MegaPairs的新数据合成方法,旨在解决多模态检索领域中训练数据不足的问题。该方法利用视觉语言模型(VLMs)和开放域图像,生成了一个大规模的合成数据集。实验结果表明,MegaPairs生成的数据质量高,使得多模态检索器的性能显著超过了基于现有数据集训练的基线模型。我们生成了超过2600万的训练实例,并训练了多种规模的模型,这些模型在多个基准测试中表现出色。'}}}, {'id': 'https://huggingface.co/papers/2412.14689', 'title': 'How to Synthesize Text Data without Model Collapse?', 'url': 'https://huggingface.co/papers/2412.14689', 'abstract': 'Model collapse in synthetic data indicates that iterative training on self-generated data leads to a gradual decline in performance. With the proliferation of AI models, synthetic data will fundamentally reshape the web data ecosystem. Future GPT-{n} models will inevitably be trained on a blend of synthetic and human-produced data. In this paper, we focus on two questions: what is the impact of synthetic data on language model training, and how to synthesize data without model collapse? 
We first pre-train language models across different proportions of synthetic data, revealing a negative correlation between the proportion of synthetic data and model performance. We further conduct statistical analysis on synthetic data to uncover distributional shift phenomenon and over-concentration of n-gram features. Inspired by the above findings, we propose token editing on human-produced data to obtain semi-synthetic data. As a proof of concept, we theoretically demonstrate that token-level editing can prevent model collapse, as the test error is constrained by a finite upper bound. We conduct extensive experiments on pre-training from scratch, continual pre-training, and supervised fine-tuning. The results validate our theoretical proof that token-level editing improves data quality and enhances model performance.', 'score': 32, 'issue_id': 1228, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': 'b10419cab812f04f', 'authors': ['Xuekai Zhu', 'Daixuan Cheng', 'Hengli Li', 'Kaiyan Zhang', 'Ermo Hua', 'Xingtai Lv', 'Ning Ding', 'Zhouhan Lin', 'Zilong Zheng', 'Bowen Zhou'], 'affiliations': ['Department of Electronic Engineering, Tsinghua University', 'Institute for Artificial Intelligence, Peking University', 'LUMIA Lab, Shanghai Jiao Tong University', 'Shanghai Artificial Intelligence Laboratory', 'State Key Laboratory of General Artificial Intelligence, BIGAI'], 'pdf_title_img': 'assets/pdf/title_img/2412.14689.jpg', 'data': {'categories': ['#dataset', '#training', '#synthetic', '#data'], 'emoji': '🧬', 'ru': {'title': 'Токенное редактирование: ключ к качественным синтетическим данным для языковых моделей', 'desc': 'Статья исследует влияние синтетических данных на обучение языковых моделей и способы их генерации без коллапса модели. Авторы обнаружили отрицательную корреляцию между долей синтетических данных и производительностью модели. Они предлагают метод редактирования токенов для получения полусинтетических данных, что теоретически предотвращает коллапс модели. Эксперименты подтверждают, что редактирование на уровне токенов улучшает качество данных и повышает производительность модели.'}, 'en': {'title': 'Preventing Model Collapse with Smart Data Editing', 'desc': "This paper investigates the effects of using synthetic data in training language models, highlighting that too much synthetic data can lead to model collapse, where performance declines. The authors find a negative relationship between the amount of synthetic data and the model's effectiveness, indicating that relying solely on synthetic data is detrimental. They introduce a method called token editing on human-produced data to create semi-synthetic data, which helps maintain model performance. Their experiments confirm that this approach improves data quality and prevents the issues associated with model collapse."}, 'zh': {'title': '合成数据与模型崩溃的挑战与解决方案', 'desc': '本论文探讨了合成数据对语言模型训练的影响,发现随着合成数据比例的增加,模型性能逐渐下降,出现模型崩溃现象。我们通过统计分析揭示了合成数据的分布偏移和n-gram特征的过度集中。为了解决这一问题,我们提出了对人类生成数据进行标记编辑,以获得半合成数据。实验结果验证了标记级编辑可以提高数据质量,从而提升模型性能。'}}}, {'id': 'https://huggingface.co/papers/2412.15204', 'title': 'LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks', 'url': 'https://huggingface.co/papers/2412.15204', 'abstract': 'This paper introduces LongBench v2, a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. 
LongBench v2 consists of 503 challenging multiple-choice questions, with contexts ranging from 8k to 2M words, across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding. To ensure the breadth and the practicality, we collect data from nearly 100 highly educated individuals with diverse professional backgrounds. We employ both automated and manual review processes to maintain high quality and difficulty, resulting in human experts achieving only 53.7% accuracy under a 15-minute time constraint. Our evaluation reveals that the best-performing model, when directly answers the questions, achieves only 50.1% accuracy. In contrast, the o1-preview model, which includes longer reasoning, achieves 57.7%, surpassing the human baseline by 4%. These results highlight the importance of enhanced reasoning ability and scaling inference-time compute to tackle the long-context challenges in LongBench v2. The project is available at https://longbench2.github.io.', 'score': 27, 'issue_id': 1227, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': '6cf25f1f8b2e5710', 'authors': ['Yushi Bai', 'Shangqing Tu', 'Jiajie Zhang', 'Hao Peng', 'Xiaozhi Wang', 'Xin Lv', 'Shulin Cao', 'Jiazheng Xu', 'Lei Hou', 'Yuxiao Dong', 'Jie Tang', 'Juanzi Li'], 'affiliations': ['Tsinghua University', 'Zhipu.AI'], 'pdf_title_img': 'assets/pdf/title_img/2412.15204.jpg', 'data': {'categories': ['#reasoning', '#long_context', '#dataset', '#benchmark'], 'emoji': '📏', 'ru': {'title': 'LongBench v2: Испытание для ИИ в работе с длинными контекстами', 'desc': 'LongBench v2 - это новый бенчмарк для оценки способности больших языковых моделей (LLM) работать с длинными контекстами. Он включает 503 сложных вопроса с множественным выбором в шести категориях задач, с контекстами от 8 тысяч до 2 миллионов слов. Данные собраны от почти 100 высококвалифицированных специалистов из разных областей, а качество и сложность обеспечиваются автоматическим и ручным рецензированием. Результаты показывают, что лучшая модель достигает точности 50.1% при прямом ответе на вопросы, в то время как модель o1-preview с более длительным рассуждением достигает 57.7%, превосходя человеческий базовый уровень на 4%.'}, 'en': {'title': "LongBench v2: Elevating LLMs' Long-Context Reasoning Skills", 'desc': 'This paper presents LongBench v2, a benchmark for evaluating large language models (LLMs) on long-context tasks that require advanced reasoning and understanding. It includes 503 multiple-choice questions with contexts ranging from 8,000 to 2 million words, covering various tasks like question answering and dialogue understanding. The benchmark was developed using data from nearly 100 educated individuals to ensure quality and difficulty, with human experts achieving only 53.7% accuracy under time constraints. 
The findings show that while the best LLMs perform around 50.1% accuracy, a model with enhanced reasoning capabilities can exceed human performance, emphasizing the need for improved reasoning in LLMs for long-context challenges.'}, 'zh': {'title': '提升推理能力,挑战长上下文问题', 'desc': '本文介绍了LongBench v2,这是一个基准测试,旨在评估大型语言模型(LLM)处理长上下文问题的能力。这些问题需要深入理解和推理,涵盖了多个现实世界的任务。LongBench v2包含503个具有挑战性的多项选择题,文本长度从8000到200万字不等,涉及六个主要任务类别。评估结果显示,最佳模型在直接回答问题时的准确率仅为50.1%,而经过更长推理的o1-preview模型则达到了57.7%,超越了人类基准。'}}}, {'id': 'https://huggingface.co/papers/2412.15213', 'title': 'Flowing from Words to Pixels: A Framework for Cross-Modality Evolution', 'url': 'https://huggingface.co/papers/2412.15213', 'abstract': 'Diffusion models, and their generalization, flow matching, have had a remarkable impact on the field of media generation. Here, the conventional approach is to learn the complex mapping from a simple source distribution of Gaussian noise to the target media distribution. For cross-modal tasks such as text-to-image generation, this same mapping from noise to image is learnt whilst including a conditioning mechanism in the model. One key and thus far relatively unexplored feature of flow matching is that, unlike Diffusion models, they are not constrained for the source distribution to be noise. Hence, in this paper, we propose a paradigm shift, and ask the question of whether we can instead train flow matching models to learn a direct mapping from the distribution of one modality to the distribution of another, thus obviating the need for both the noise distribution and conditioning mechanism. We present a general and simple framework, CrossFlow, for cross-modal flow matching. We show the importance of applying Variational Encoders to the input data, and introduce a method to enable Classifier-free guidance. Surprisingly, for text-to-image, CrossFlow with a vanilla transformer without cross attention slightly outperforms standard flow matching, and we show that it scales better with training steps and model size, while also allowing for interesting latent arithmetic which results in semantically meaningful edits in the output space. To demonstrate the generalizability of our approach, we also show that CrossFlow is on par with or outperforms the state-of-the-art for various cross-modal / intra-modal mapping tasks, viz. image captioning, depth estimation, and image super-resolution. We hope this paper contributes to accelerating progress in cross-modal media generation.', 'score': 17, 'issue_id': 1227, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': '7e00a5665592fb4d', 'authors': ['Qihao Liu', 'Xi Yin', 'Alan Yuille', 'Andrew Brown', 'Mannat Singh'], 'affiliations': ['GenAI, Meta', 'Johns Hopkins University'], 'pdf_title_img': 'assets/pdf/title_img/2412.15213.jpg', 'data': {'categories': ['#cv', '#diffusion', '#multimodal'], 'emoji': '🔀', 'ru': {'title': 'CrossFlow: Прямое отображение между модальностями без шума', 'desc': 'Статья представляет новый подход к кросс-модальной генерации медиа, называемый CrossFlow. В отличие от традиционных диффузионных моделей, CrossFlow использует согласование потоков для прямого отображения распределения одной модальности в другую. Авторы применяют вариационные энкодеры к входным данным и вводят метод для бесклассовой направленности. 
CrossFlow показывает улучшенные результаты в задачах генерации изображений по тексту, описания изображений, оценки глубины и суперразрешения.'}, 'en': {'title': 'Revolutionizing Cross-Modal Media Generation with CrossFlow', 'desc': "This paper introduces CrossFlow, a new framework for cross-modal flow matching that allows direct mapping between different media distributions without relying on Gaussian noise. Unlike traditional diffusion models, CrossFlow eliminates the need for a conditioning mechanism, simplifying the training process. The authors demonstrate that using Variational Encoders enhances the model's performance and enables Classifier-free guidance. Results show that CrossFlow not only outperforms standard flow matching in text-to-image tasks but also excels in various other cross-modal and intra-modal mapping tasks, indicating its broad applicability in media generation."}, 'zh': {'title': '跨模态流匹配的新思路', 'desc': '扩散模型及其推广的流匹配在媒体生成领域产生了显著影响。传统方法是从简单的高斯噪声源分布学习到目标媒体分布的复杂映射。本文提出了一种新的思路,探索如何直接从一种模态的分布映射到另一种模态的分布,省去噪声分布和条件机制的需求。我们提出了CrossFlow框架,并展示了其在文本到图像生成等跨模态任务中的优越性。'}}}, {'id': 'https://huggingface.co/papers/2412.14462', 'title': 'Affordance-Aware Object Insertion via Mask-Aware Dual Diffusion', 'url': 'https://huggingface.co/papers/2412.14462', 'abstract': 'As a common image editing operation, image composition involves integrating foreground objects into background scenes. In this paper, we expand the application of the concept of Affordance from human-centered image composition tasks to a more general object-scene composition framework, addressing the complex interplay between foreground objects and background scenes. Following the principle of Affordance, we define the affordance-aware object insertion task, which aims to seamlessly insert any object into any scene with various position prompts. To address the limited data issue and incorporate this task, we constructed the SAM-FB dataset, which contains over 3 million examples across more than 3,000 object categories. Furthermore, we propose the Mask-Aware Dual Diffusion (MADD) model, which utilizes a dual-stream architecture to simultaneously denoise the RGB image and the insertion mask. By explicitly modeling the insertion mask in the diffusion process, MADD effectively facilitates the notion of affordance. Extensive experimental results show that our method outperforms the state-of-the-art methods and exhibits strong generalization performance on in-the-wild images. Please refer to our code on https://github.com/KaKituken/affordance-aware-any.', 'score': 14, 'issue_id': 1228, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': 'd674ddd6732ab566', 'authors': ['Jixuan He', 'Wanhua Li', 'Ye Liu', 'Junsik Kim', 'Donglai Wei', 'Hanspeter Pfister'], 'affiliations': ['Boston College', 'Cornell Tech', 'Harvard University', 'The Hong Kong Polytechnic University'], 'pdf_title_img': 'assets/pdf/title_img/2412.14462.jpg', 'data': {'categories': ['#cv', '#architecture', '#diffusion', '#dataset', '#synthetic'], 'emoji': '🎭', 'ru': {'title': 'Умная вставка объектов в сцены с учетом их возможностей', 'desc': 'Статья расширяет концепцию Affordance для задачи вставки объектов в сцены. Авторы создали датасет SAM-FB с более чем 3 миллионами примеров для обучения моделей. Предложена модель Mask-Aware Dual Diffusion (MADD), использующая двухпоточную архитектуру для одновременного шумоподавления RGB-изображения и маски вставки. 
Экспериментальные результаты показывают, что метод превосходит современные аналоги и демонстрирует хорошую обобщающую способность на реальных изображениях.'}, 'en': {'title': 'Seamless Object Insertion through Affordance Awareness', 'desc': 'This paper introduces a new approach to image composition by applying the concept of Affordance, which helps in understanding how objects can interact with their surroundings. It defines a novel task called affordance-aware object insertion, which allows for the seamless integration of objects into various scenes based on specific position prompts. To support this task, the authors created the SAM-FB dataset, featuring over 3 million examples from more than 3,000 object categories, addressing the challenge of limited data. The proposed Mask-Aware Dual Diffusion (MADD) model enhances the insertion process by using a dual-stream architecture to denoise both the image and the insertion mask, leading to improved performance over existing methods.'}, 'zh': {'title': '可供性驱动的图像合成新方法', 'desc': '本文探讨了图像合成中的前景物体与背景场景的复杂关系。我们引入了“可供性”这一概念,定义了可供性感知的物体插入任务,旨在将任意物体无缝插入任意场景。为了解决数据不足的问题,我们构建了SAM-FB数据集,包含超过300万例的3,000多个物体类别。此外,我们提出了Mask-Aware Dual Diffusion(MADD)模型,通过双流架构同时去噪RGB图像和插入掩码,从而有效促进可供性概念的实现。'}}}, {'id': 'https://huggingface.co/papers/2412.15214', 'title': 'LeviTor: 3D Trajectory Oriented Image-to-Video Synthesis', 'url': 'https://huggingface.co/papers/2412.15214', 'abstract': 'The intuitive nature of drag-based interaction has led to its growing adoption for controlling object trajectories in image-to-video synthesis. Still, existing methods that perform dragging in the 2D space usually face ambiguity when handling out-of-plane movements. In this work, we augment the interaction with a new dimension, i.e., the depth dimension, such that users are allowed to assign a relative depth for each point on the trajectory. That way, our new interaction paradigm not only inherits the convenience from 2D dragging, but facilitates trajectory control in the 3D space, broadening the scope of creativity. We propose a pioneering method for 3D trajectory control in image-to-video synthesis by abstracting object masks into a few cluster points. These points, accompanied by the depth information and the instance information, are finally fed into a video diffusion model as the control signal. Extensive experiments validate the effectiveness of our approach, dubbed LeviTor, in precisely manipulating the object movements when producing photo-realistic videos from static images. Project page: https://ppetrichor.github.io/levitor.github.io/', 'score': 12, 'issue_id': 1229, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': 'd40a26a749c77671', 'authors': ['Hanlin Wang', 'Hao Ouyang', 'Qiuyu Wang', 'Wen Wang', 'Ka Leong Cheng', 'Qifeng Chen', 'Yujun Shen', 'Limin Wang'], 'affiliations': ['Ant Group', 'State Key Laboratory for Novel Software Technology, Nanjing University', 'The Hong Kong University of Science and Technology', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2412.15214.jpg', 'data': {'categories': ['#video', '#diffusion', '#3d'], 'emoji': '🎞️', 'ru': {'title': '3D-контроль траектории для реалистичного синтеза видео из изображений', 'desc': 'Статья представляет новый метод синтеза видео из изображений с контролем траектории объекта в трехмерном пространстве. 
Авторы предлагают добавить глубину как новое измерение при перетаскивании объектов, что позволяет более точно управлять движением вне плоскости. Метод, названный LeviTor, абстрагирует маски объектов в несколько кластерных точек и использует их вместе с информацией о глубине и экземпляре в качестве сигнала управления для диффузионной модели видео. Эксперименты подтверждают эффективность подхода в создании фотореалистичных видео с точным контролем движения объектов.'}, 'en': {'title': 'Enhancing 3D Trajectory Control in Video Synthesis with Depth-Aware Dragging', 'desc': 'This paper introduces a new method for controlling object movements in 3D space during image-to-video synthesis using drag-based interaction. By incorporating depth information, users can specify the relative depth of points along a trajectory, enhancing the traditional 2D dragging approach. The proposed method, named LeviTor, simplifies the representation of object masks into cluster points, which are then utilized as control signals in a video diffusion model. Experimental results demonstrate that LeviTor effectively improves the precision of object manipulation, enabling the creation of realistic videos from static images.'}, 'zh': {'title': '引入深度维度,提升三维轨迹控制的创意空间', 'desc': '本论文提出了一种新的三维轨迹控制方法,旨在改善图像到视频合成中的拖动交互体验。通过引入深度维度,用户可以为轨迹上的每个点分配相对深度,从而更好地控制三维空间中的物体运动。我们的方法将物体掩膜抽象为少量聚类点,并结合深度信息和实例信息,作为控制信号输入到视频扩散模型中。实验结果表明,LeviTor方法在生成逼真的视频时,能够精确操控物体运动。'}}}, {'id': 'https://huggingface.co/papers/2412.15084', 'title': 'AceMath: Advancing Frontier Math Reasoning with Post-Training and Reward Modeling', 'url': 'https://huggingface.co/papers/2412.15084', 'abstract': 'In this paper, we introduce AceMath, a suite of frontier math models that excel in solving complex math problems, along with highly effective reward models capable of evaluating generated solutions and reliably identifying the correct ones. To develop the instruction-tuned math models, we propose a supervised fine-tuning (SFT) process that first achieves competitive performance across general domains, followed by targeted fine-tuning for the math domain using a carefully curated set of prompts and synthetically generated responses. The resulting model, AceMath-72B-Instruct greatly outperforms Qwen2.5-Math-72B-Instruct, GPT-4o and Claude-3.5 Sonnet. To develop math-specialized reward model, we first construct AceMath-RewardBench, a comprehensive and robust benchmark for evaluating math reward models across diverse problems and difficulty levels. After that, we present a systematic approach to build our math reward models. The resulting model, AceMath-72B-RM, consistently outperforms state-of-the-art reward models. Furthermore, when combining AceMath-72B-Instruct with AceMath-72B-RM, we achieve the highest average rm@8 score across the math reasoning benchmarks. 
We will release model weights, training data, and evaluation benchmarks at: https://research.nvidia.com/labs/adlr/acemath', 'score': 10, 'issue_id': 1228, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': 'b99bb71eb45dbc5a', 'authors': ['Zihan Liu', 'Yang Chen', 'Mohammad Shoeybi', 'Bryan Catanzaro', 'Wei Ping'], 'affiliations': ['NVIDIA'], 'pdf_title_img': 'assets/pdf/title_img/2412.15084.jpg', 'data': {'categories': ['#open_source', '#dataset', '#benchmark', '#training', '#optimization', '#math', '#synthetic'], 'emoji': '🧮', 'ru': {'title': 'AceMath: Прорыв в решении сложных математических задач с помощью ИИ', 'desc': 'В этой статье представлена система AceMath - набор передовых математических моделей, способных решать сложные математические задачи. Авторы разработали эффективные модели вознаграждения для оценки генерируемых решений. Для создания инструктивно-настроенных математических моделей предложен процесс контролируемой тонкой настройки (SFT), который сначала достигает конкурентоспособной производительности в общих областях, а затем проводит целевую настройку для математической области. Результирующая модель AceMath-72B-Instruct значительно превосходит другие современные модели в решении математических задач.'}, 'en': {'title': 'AceMath: Revolutionizing Math Problem Solving with Advanced Models', 'desc': 'This paper presents AceMath, a collection of advanced mathematical models designed to solve complex math problems effectively. The authors introduce a supervised fine-tuning (SFT) process that enhances model performance in general domains before specializing in math through targeted training with curated prompts. The AceMath-72B-Instruct model significantly surpasses existing models like Qwen2.5-Math and GPT-4o in solving math problems. Additionally, the paper details the creation of AceMath-RewardBench, a benchmark for evaluating math reward models, leading to the development of AceMath-72B-RM, which outperforms other reward models in assessing solution accuracy.'}, 'zh': {'title': 'AceMath:数学问题解决的前沿模型', 'desc': '本文介绍了AceMath,这是一个前沿数学模型套件,擅长解决复杂的数学问题,并配备了高效的奖励模型,能够评估生成的解决方案并可靠地识别正确答案。为了开发指令调优的数学模型,我们提出了一种监督微调(SFT)过程,首先在一般领域中实现竞争性表现,然后通过精心策划的提示和合成生成的响应进行针对数学领域的微调。最终模型AceMath-72B-Instruct在性能上大幅超越了Qwen2.5-Math-72B-Instruct、GPT-4o和Claude-3.5 Sonnet。我们还构建了AceMath-RewardBench,这是一个全面且强大的基准,用于评估数学奖励模型在不同问题和难度级别上的表现。'}}}, {'id': 'https://huggingface.co/papers/2412.15200', 'title': 'DI-PCG: Diffusion-based Efficient Inverse Procedural Content Generation for High-quality 3D Asset Creation', 'url': 'https://huggingface.co/papers/2412.15200', 'abstract': 'Procedural Content Generation (PCG) is powerful in creating high-quality 3D contents, yet controlling it to produce desired shapes is difficult and often requires extensive parameter tuning. Inverse Procedural Content Generation aims to automatically find the best parameters under the input condition. However, existing sampling-based and neural network-based methods still suffer from numerous sample iterations or limited controllability. In this work, we present DI-PCG, a novel and efficient method for Inverse PCG from general image conditions. At its core is a lightweight diffusion transformer model, where PCG parameters are directly treated as the denoising target and the observed images as conditions to control parameter generation. DI-PCG is efficient and effective. 
With only 7.6M network parameters and 30 GPU hours to train, it demonstrates superior performance in recovering parameters accurately, and generalizing well to in-the-wild images. Quantitative and qualitative experiment results validate the effectiveness of DI-PCG in inverse PCG and image-to-3D generation tasks. DI-PCG offers a promising approach for efficient inverse PCG and represents a valuable exploration step towards a 3D generation path that models how to construct a 3D asset using parametric models.', 'score': 8, 'issue_id': 1228, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': '7a7e2f117e332add', 'authors': ['Wang Zhao', 'Yan-Pei Cao', 'Jiale Xu', 'Yuejiang Dong', 'Ying Shan'], 'affiliations': ['ARC Lab, Tencent PCG', 'Tsinghua University', 'VAST'], 'pdf_title_img': 'assets/pdf/title_img/2412.15200.jpg', 'data': {'categories': ['#architecture', '#diffusion', '#3d', '#games'], 'emoji': '🧠', 'ru': {'title': 'DI-PCG: Эффективная обратная генерация процедурного контента с помощью диффузионных трансформеров', 'desc': 'Статья представляет DI-PCG - новый эффективный метод обратной генерации процедурного контента (Inverse PCG) на основе изображений. В основе метода лежит легковесная модель диффузионного трансформера, которая напрямую генерирует параметры PCG, используя наблюдаемые изображения в качестве условий. DI-PCG демонстрирует превосходную производительность в точном восстановлении параметров и хорошо обобщается на реальные изображения. Метод предлагает перспективный подход к эффективной обратной PCG и представляет ценный шаг в исследовании пути генерации 3D-контента с использованием параметрических моделей.'}, 'en': {'title': 'Efficient Inverse PCG with DI-PCG: Transforming Images into 3D Shapes', 'desc': 'This paper introduces DI-PCG, a new method for Inverse Procedural Content Generation (PCG) that simplifies the process of generating 3D shapes from images. It utilizes a lightweight diffusion transformer model to directly link PCG parameters with observed images, making it easier to control the generation process. Unlike previous methods, DI-PCG requires fewer resources, with only 7.6 million parameters and 30 GPU hours for training, while still achieving high accuracy in parameter recovery. The results show that DI-PCG not only performs well in controlled settings but also generalizes effectively to real-world images, marking a significant advancement in 3D content creation.'}, 'zh': {'title': '高效逆程序内容生成的新方法', 'desc': '程序内容生成(PCG)在创建高质量3D内容方面非常强大,但控制其生成特定形状却很困难,通常需要大量的参数调优。逆程序内容生成旨在自动找到最佳参数以满足输入条件。现有的基于采样和神经网络的方法仍然面临许多样本迭代或可控性有限的问题。本文提出了一种新颖高效的逆PCG方法DI-PCG,利用轻量级扩散变换器模型,直接将PCG参数视为去噪目标,并将观察到的图像作为控制参数生成的条件。'}}}, {'id': 'https://huggingface.co/papers/2412.14233', 'title': 'Descriptive Caption Enhancement with Visual Specialists for Multimodal Perception', 'url': 'https://huggingface.co/papers/2412.14233', 'abstract': 'Training Large Multimodality Models (LMMs) relies on descriptive image caption that connects image and language. Existing methods either distill the caption from the LMM models or construct the captions from the internet images or by human. We propose to leverage off-the-shelf visual specialists, which were trained from annotated images initially not for image captioning, for enhancing the image caption. 
Our approach, named DCE, explores object low-level and fine-grained attributes (e.g., depth, emotion and fine-grained categories) and object relations (e.g., relative location and human-object-interaction (HOI)), and combine the attributes into the descriptive caption. Experiments demonstrate that such visual specialists are able to improve the performance for visual understanding tasks as well as reasoning that benefits from more accurate visual understanding. We will release the source code and the pipeline so that other visual specialists are easily combined into the pipeline. The complete source code of DCE pipeline and datasets will be available at https://github.com/syp2ysy/DCE.', 'score': 5, 'issue_id': 1228, 'pub_date': '2024-12-18', 'pub_date_card': {'ru': '18 декабря', 'en': 'December 18', 'zh': '12月18日'}, 'hash': '007f47cd739c576c', 'authors': ['Yanpeng Sun', 'Jing Hao', 'Ke Zhu', 'Jiang-Jiang Liu', 'Yuxiang Zhao', 'Xiaofan Li', 'Gang Zhang', 'Zechao Li', 'Jingdong Wang'], 'affiliations': ['Baidu VIS', 'Nanjing University', 'Nanjing University of Science and Technology', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2412.14233.jpg', 'data': {'categories': ['#cv', '#open_source', '#dataset', '#reasoning', '#multimodal', '#optimization'], 'emoji': '🖼️', 'ru': {'title': 'Улучшение мультимодальных моделей с помощью специализированного визуального анализа', 'desc': 'Статья представляет новый подход к обучению крупных мультимодальных моделей (LMM), используя специализированные визуальные модели для улучшения подписей к изображениям. Метод DCE исследует низкоуровневые и детальные атрибуты объектов, а также отношения между ними. Эксперименты показывают, что такой подход улучшает понимание визуальной информации и рассуждения на её основе. Авторы планируют опубликовать исходный код и pipeline для простой интеграции других визуальных специалистов.'}, 'en': {'title': 'Enhancing Image Captions with Visual Specialists', 'desc': 'This paper introduces a method called DCE that enhances image captions by utilizing existing visual specialists, which are models trained on annotated images for tasks other than captioning. DCE focuses on extracting low-level and fine-grained attributes of objects, such as depth and emotion, as well as their relationships, like location and interactions with humans. By integrating these detailed attributes into the captions, the method improves the performance of visual understanding and reasoning tasks. The authors plan to share their source code and datasets to facilitate the use of other visual specialists in this enhanced captioning process.'}, 'zh': {'title': '利用视觉专家提升图像描述质量', 'desc': '本文提出了一种新的方法,称为DCE,用于增强图像描述的质量。我们利用现成的视觉专家,这些专家最初是通过标注图像训练的,并不是专门用于图像描述。DCE方法探索了物体的低级和细粒度属性,以及物体之间的关系,并将这些属性结合到描述性标题中。实验表明,这种方法能够提高视觉理解任务的性能,并改善推理能力。'}}}, {'id': 'https://huggingface.co/papers/2412.15216', 'title': 'UIP2P: Unsupervised Instruction-based Image Editing via Cycle Edit Consistency', 'url': 'https://huggingface.co/papers/2412.15216', 'abstract': 'We propose an unsupervised model for instruction-based image editing that eliminates the need for ground-truth edited images during training. Existing supervised methods depend on datasets containing triplets of input image, edited image, and edit instruction. These are generated by either existing editing methods or human-annotations, which introduce biases and limit their generalization ability. 
Our method addresses these challenges by introducing a novel editing mechanism called Cycle Edit Consistency (CEC), which applies forward and backward edits in one training step and enforces consistency in image and attention spaces. This allows us to bypass the need for ground-truth edited images and unlock training for the first time on datasets comprising either real image-caption pairs or image-caption-edit triplets. We empirically show that our unsupervised technique performs better across a broader range of edits with high fidelity and precision. By eliminating the need for pre-existing datasets of triplets, reducing biases associated with supervised methods, and proposing CEC, our work represents a significant advancement in unblocking scaling of instruction-based image editing.', 'score': 4, 'issue_id': 1227, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': 'ee62a21bee761d14', 'authors': ['Enis Simsar', 'Alessio Tonioni', 'Yongqin Xian', 'Thomas Hofmann', 'Federico Tombari'], 'affiliations': ['ETH Zurich', 'Google Switzerland', 'Technical University of Munich'], 'pdf_title_img': 'assets/pdf/title_img/2412.15216.jpg', 'data': {'categories': ['#cv', '#dataset', '#training'], 'emoji': '🖼️', 'ru': {'title': 'Редактирование изображений без учителя: новый подход к обучению на неразмеченных данных', 'desc': 'Авторы предлагают неконтролируемую модель для редактирования изображений на основе инструкций, которая устраняет необходимость в размеченных данных во время обучения. Метод вводит новый механизм редактирования под названием Cycle Edit Consistency (CEC), который применяет прямые и обратные правки в одном шаге обучения и обеспечивает согласованность в пространствах изображений и внимания. Это позволяет обучать модель на наборах данных, состоящих из пар изображение-подпись или триплетов изображение-подпись-правка, без необходимости в заранее отредактированных изображениях. Эмпирически показано, что предложенный неконтролируемый метод работает лучше для более широкого спектра правок с высокой точностью и достоверностью.'}, 'en': {'title': 'Revolutionizing Image Editing: Unsupervised Learning Without Ground Truth', 'desc': 'This paper presents an unsupervised model for instruction-based image editing that does not require ground-truth edited images for training. Traditional supervised methods rely on datasets with input images, edited images, and edit instructions, which can introduce biases and limit how well the model can generalize. The authors introduce a new editing mechanism called Cycle Edit Consistency (CEC), which allows for simultaneous forward and backward edits, ensuring consistency in both image and attention spaces. 
Their approach demonstrates improved performance across various edits, highlighting the potential for scaling instruction-based image editing without the constraints of existing datasets.'}, 'zh': {'title': '无监督图像编辑:打破传统限制', 'desc': '我们提出了一种无监督的基于指令的图像编辑模型,训练时不需要真实的编辑图像。现有的监督方法依赖于包含输入图像、编辑图像和编辑指令的三元组数据集,这些数据集可能引入偏差并限制了模型的泛化能力。我们的方法通过引入一种新的编辑机制——循环编辑一致性(CEC),在一个训练步骤中应用前向和后向编辑,从而在图像和注意力空间中强制一致性。我们的实验证明,这种无监督技术在更广泛的编辑任务中表现出更高的保真度和精确度,标志着基于指令的图像编辑的重大进展。'}}}, {'id': 'https://huggingface.co/papers/2412.14642', 'title': 'TOMG-Bench: Evaluating LLMs on Text-based Open Molecule Generation', 'url': 'https://huggingface.co/papers/2412.14642', 'abstract': 'In this paper, we propose Text-based Open Molecule Generation Benchmark (TOMG-Bench), the first benchmark to evaluate the open-domain molecule generation capability of LLMs. TOMG-Bench encompasses a dataset of three major tasks: molecule editing (MolEdit), molecule optimization (MolOpt), and customized molecule generation (MolCustom). Each task further contains three subtasks, with each subtask comprising 5,000 test samples. Given the inherent complexity of open molecule generation, we have also developed an automated evaluation system that helps measure both the quality and the accuracy of the generated molecules. Our comprehensive benchmarking of 25 LLMs reveals the current limitations and potential areas for improvement in text-guided molecule discovery. Furthermore, with the assistance of OpenMolIns, a specialized instruction tuning dataset proposed for solving challenges raised by TOMG-Bench, Llama3.1-8B could outperform all the open-source general LLMs, even surpassing GPT-3.5-turbo by 46.5\\% on TOMG-Bench. Our codes and datasets are available through https://github.com/phenixace/TOMG-Bench.', 'score': 4, 'issue_id': 1227, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': 'd6b4853faa2e7839', 'authors': ['Jiatong Li', 'Junxian Li', 'Yunqing Liu', 'Dongzhan Zhou', 'Qing Li'], 'affiliations': ['Shanghai AI Lab', 'Shanghai Jiao Tong University', 'The Hong Kong Polytechnic University'], 'pdf_title_img': 'assets/pdf/title_img/2412.14642.jpg', 'data': {'categories': ['#dataset', '#benchmark', '#open_source', '#science'], 'emoji': '🧪', 'ru': {'title': 'Новый бенчмарк раскрывает потенциал языковых моделей в генерации молекул', 'desc': 'В статье представлен TOMG-Bench - первый бенчмарк для оценки способности языковых моделей (LLM) генерировать молекулы без ограничений. Бенчмарк включает три основные задачи: редактирование молекул, оптимизация молекул и генерация молекул по заданным параметрам. Авторы разработали автоматизированную систему оценки качества и точности сгенерированных молекул. Тестирование 25 языковых моделей выявило текущие ограничения и области для улучшения в области генерации молекул на основе текста.'}, 'en': {'title': 'Benchmarking LLMs for Molecule Generation Excellence', 'desc': 'This paper introduces TOMG-Bench, a benchmark designed to assess the ability of large language models (LLMs) in generating molecules. It includes three main tasks: molecule editing, optimization, and customized generation, each with multiple subtasks and a substantial dataset of test samples. An automated evaluation system is developed to measure the quality and accuracy of the generated molecules, highlighting the challenges in open-domain molecule generation. 
The study also shows that with the help of a specialized dataset for instruction tuning, Llama3.1-8B significantly outperforms other LLMs, indicating potential advancements in text-guided molecule discovery.'}, 'zh': {'title': '开放分子生成的新基准', 'desc': '本文提出了文本基础的开放分子生成基准(TOMG-Bench),这是第一个评估大型语言模型(LLM)在开放领域分子生成能力的基准。TOMG-Bench包含三个主要任务:分子编辑(MolEdit)、分子优化(MolOpt)和定制分子生成(MolCustom),每个任务下又有三个子任务,每个子任务包含5000个测试样本。为了应对开放分子生成的复杂性,我们开发了一个自动评估系统,以测量生成分子的质量和准确性。我们的基准测试显示了25个LLM的当前局限性和潜在改进领域,并且通过使用OpenMolIns数据集,Llama3.1-8B在TOMG-Bench上超越了所有开源通用LLM,甚至比GPT-3.5-turbo高出46.5%。'}}}, {'id': 'https://huggingface.co/papers/2412.15191', 'title': 'AV-Link: Temporally-Aligned Diffusion Features for Cross-Modal Audio-Video Generation', 'url': 'https://huggingface.co/papers/2412.15191', 'abstract': 'We propose AV-Link, a unified framework for Video-to-Audio and Audio-to-Video generation that leverages the activations of frozen video and audio diffusion models for temporally-aligned cross-modal conditioning. The key to our framework is a Fusion Block that enables bidirectional information exchange between our backbone video and audio diffusion models through a temporally-aligned self attention operation. Unlike prior work that uses feature extractors pretrained for other tasks for the conditioning signal, AV-Link can directly leverage features obtained by the complementary modality in a single framework i.e. video features to generate audio, or audio features to generate video. We extensively evaluate our design choices and demonstrate the ability of our method to achieve synchronized and high-quality audiovisual content, showcasing its potential for applications in immersive media generation. Project Page: snap-research.github.io/AVLink/', 'score': 3, 'issue_id': 1245, 'pub_date': '2024-12-19', 'pub_date_card': {'ru': '19 декабря', 'en': 'December 19', 'zh': '12月19日'}, 'hash': 'b6e778cce020ac78', 'authors': ['Moayed Haji-Ali', 'Willi Menapace', 'Aliaksandr Siarohin', 'Ivan Skorokhodov', 'Alper Canberk', 'Kwot Sin Lee', 'Vicente Ordonez', 'Sergey Tulyakov'], 'affiliations': ['Rice University', 'Snap Inc'], 'pdf_title_img': 'assets/pdf/title_img/2412.15191.jpg', 'data': {'categories': ['#diffusion', '#multimodal', '#video', '#audio'], 'emoji': '🎭', 'ru': {'title': 'Единая система для двунаправленной генерации аудио и видео', 'desc': 'AV-Link - это унифицированная система для генерации видео по аудио и аудио по видео, использующая активации замороженных диффузионных моделей для кросс-модального обусловливания. Ключевым элементом является Fusion Block, позволяющий осуществлять двунаправленный обмен информацией между базовыми видео- и аудиомоделями через операцию самовнимания. В отличие от предыдущих подходов, AV-Link напрямую использует признаки, полученные из комплементарной модальности. Система демонстрирует возможность создания синхронизированного и качественного аудиовизуального контента.'}, 'en': {'title': 'Bridging Audio and Video with AV-Link', 'desc': 'AV-Link is a new framework that allows for the generation of audio from video and vice versa by using advanced diffusion models. It features a Fusion Block that facilitates the exchange of information between video and audio models, ensuring that they are temporally aligned. This approach is different from previous methods, as it directly uses features from one modality to enhance the generation of the other, rather than relying on pre-trained models. 
The results show that AV-Link can create synchronized and high-quality audiovisual content, making it valuable for immersive media applications.'}, 'zh': {'title': 'AV-Link:视频与音频的无缝生成框架', 'desc': '我们提出了AV-Link,这是一个统一的框架,用于视频到音频和音频到视频的生成。该框架利用冻结的视频和音频扩散模型的激活,进行时间对齐的跨模态条件处理。我们的关键组件是融合块,它通过时间对齐的自注意力操作,实现视频和音频扩散模型之间的信息双向交换。与之前的工作不同,AV-Link可以直接利用互补模态获得的特征,生成高质量的同步视听内容,展示了其在沉浸式媒体生成中的潜力。'}}}, {'id': 'https://huggingface.co/papers/2412.14283', 'title': 'PixelMan: Consistent Object Editing with Diffusion Models via Pixel Manipulation and Generation', 'url': 'https://huggingface.co/papers/2412.14283', 'abstract': 'Recent research explores the potential of Diffusion Models (DMs) for consistent object editing, which aims to modify object position, size, and composition, etc., while preserving the consistency of objects and background without changing their texture and attributes. Current inference-time methods often rely on DDIM inversion, which inherently compromises efficiency and the achievable consistency of edited images. Recent methods also utilize energy guidance which iteratively updates the predicted noise and can drive the latents away from the original image, resulting in distortions. In this paper, we propose PixelMan, an inversion-free and training-free method for achieving consistent object editing via Pixel Manipulation and generation, where we directly create a duplicate copy of the source object at target location in the pixel space, and introduce an efficient sampling approach to iteratively harmonize the manipulated object into the target location and inpaint its original location, while ensuring image consistency by anchoring the edited image to be generated to the pixel-manipulated image as well as by introducing various consistency-preserving optimization techniques during inference. Experimental evaluations based on benchmark datasets as well as extensive visual comparisons show that in as few as 16 inference steps, PixelMan outperforms a range of state-of-the-art training-based and training-free methods (usually requiring 50 steps) on multiple consistent object editing tasks.', 'score': 3, 'issue_id': 1239, 'pub_date': '2024-12-18', 'pub_date_card': {'ru': '18 декабря', 'en': 'December 18', 'zh': '12月18日'}, 'hash': 'c862b26c5d60f9eb', 'authors': ['Liyao Jiang', 'Negar Hassanpour', 'Mohammad Salameh', 'Mohammadreza Samadi', 'Jiao He', 'Fengyu Sun', 'Di Niu'], 'affiliations': ['Huawei Kirin Solution, China', 'Huawei Technologies Canada', 'University of Alberta'], 'pdf_title_img': 'assets/pdf/title_img/2412.14283.jpg', 'data': {'categories': ['#inference', '#cv', '#benchmark', '#optimization', '#diffusion'], 'emoji': '🖼️', 'ru': {'title': 'PixelMan: эффективное редактирование объектов без инверсии и дообучения', 'desc': 'Статья представляет PixelMan - новый метод для редактирования объектов на изображениях с помощью диффузионных моделей. В отличие от существующих подходов, PixelMan не требует инверсии или дополнительного обучения модели. Метод работает путем создания копии объекта в пиксельном пространстве и последующей гармонизации с помощью эффективного сэмплирования. PixelMan превосходит современные методы по качеству результатов и эффективности, требуя всего 16 шагов вывода.'}, 'en': {'title': 'PixelMan: Efficient and Consistent Object Editing Made Easy', 'desc': 'This paper introduces PixelMan, a novel method for consistent object editing using Diffusion Models (DMs). 
Unlike traditional methods that rely on DDIM inversion, PixelMan operates without the need for inversion or training, allowing for more efficient and consistent edits. The approach involves directly manipulating pixels to create a duplicate of the source object at a desired location while maintaining the integrity of the original image. Experimental results demonstrate that PixelMan achieves superior performance in fewer steps compared to existing state-of-the-art methods, making it a significant advancement in the field of image editing.'}, 'zh': {'title': 'PixelMan:高效一致性物体编辑的新方法', 'desc': '本研究探讨了扩散模型在一致性物体编辑中的潜力,旨在在不改变物体纹理和属性的情况下,修改物体的位置、大小和组成等。当前的推理方法通常依赖于DDIM反演,这会影响编辑图像的效率和一致性。我们提出了PixelMan,这是一种无反演和无训练的方法,通过像素操作和生成实现一致性物体编辑。实验结果表明,PixelMan在多个一致性物体编辑任务中,经过仅16次推理步骤,超越了多种最先进的训练和无训练方法。'}}}, {'id': 'https://huggingface.co/papers/2412.13377', 'title': 'DateLogicQA: Benchmarking Temporal Biases in Large Language Models', 'url': 'https://huggingface.co/papers/2412.13377', 'abstract': "This paper introduces DateLogicQA, a benchmark with 190 questions covering diverse date formats, temporal contexts, and reasoning types. We propose the Semantic Integrity Metric to assess tokenization quality and analyse two biases: Representation-Level Bias, affecting embeddings, and Logical-Level Bias, influencing reasoning outputs. Our findings provide a comprehensive evaluation of LLMs' capabilities and limitations in temporal reasoning, highlighting key challenges in handling temporal data accurately. The GitHub repository for our work is available at https://github.com/gagan3012/EAIS-Temporal-Bias", 'score': 2, 'issue_id': 1238, 'pub_date': '2024-12-17', 'pub_date_card': {'ru': '17 декабря', 'en': 'December 17', 'zh': '12月17日'}, 'hash': '2a984597afc42f8d', 'authors': ['Gagan Bhatia', 'MingZe Tang', 'Cristina Mahanta', 'Madiha Kazi'], 'affiliations': ['University of Aberdeen'], 'pdf_title_img': 'assets/pdf/title_img/2412.13377.jpg', 'data': {'categories': ['#benchmark', '#multimodal', '#reasoning', '#interpretability', '#data'], 'emoji': '🗓️', 'ru': {'title': 'DateLogicQA: новый бенчмарк для оценки временных рассуждений в языковых моделях', 'desc': 'Статья представляет DateLogicQA - набор данных из 190 вопросов, охватывающих различные форматы дат, временные контексты и типы рассуждений. Авторы предлагают метрику семантической целостности для оценки качества токенизации. Они анализируют два вида смещения: смещение на уровне представления, влияющее на эмбеддинги, и смещение на логическом уровне, влияющее на результаты рассуждений. Исследование предоставляет комплексную оценку возможностей и ограничений языковых моделей в области временных рассуждений.'}, 'en': {'title': 'Evaluating Temporal Reasoning in LLMs with DateLogicQA', 'desc': 'This paper presents DateLogicQA, a new benchmark consisting of 190 questions that test various date formats and reasoning about time. It introduces the Semantic Integrity Metric to evaluate how well tokenization preserves meaning in temporal contexts. The authors identify two types of biases: Representation-Level Bias, which affects how embeddings are formed, and Logical-Level Bias, which impacts the reasoning outputs of models. 
The study reveals important insights into the strengths and weaknesses of large language models (LLMs) in understanding and reasoning with temporal data.'}, 'zh': {'title': '时间推理的新基准与挑战', 'desc': '本文介绍了DateLogicQA,这是一个包含190个问题的基准,涵盖了多种日期格式、时间上下文和推理类型。我们提出了语义完整性度量,用于评估标记化质量,并分析了两种偏差:表示级偏差,影响嵌入;逻辑级偏差,影响推理输出。我们的研究结果全面评估了大型语言模型在时间推理方面的能力和局限性,突出了准确处理时间数据的关键挑战。'}}}, {'id': 'https://huggingface.co/papers/2412.13185', 'title': 'Move-in-2D: 2D-Conditioned Human Motion Generation', 'url': 'https://huggingface.co/papers/2412.13185', 'abstract': 'Generating realistic human videos remains a challenging task, with the most effective methods currently relying on a human motion sequence as a control signal. Existing approaches often use existing motion extracted from other videos, which restricts applications to specific motion types and global scene matching. We propose Move-in-2D, a novel approach to generate human motion sequences conditioned on a scene image, allowing for diverse motion that adapts to different scenes. Our approach utilizes a diffusion model that accepts both a scene image and text prompt as inputs, producing a motion sequence tailored to the scene. To train this model, we collect a large-scale video dataset featuring single-human activities, annotating each video with the corresponding human motion as the target output. Experiments demonstrate that our method effectively predicts human motion that aligns with the scene image after projection. Furthermore, we show that the generated motion sequence improves human motion quality in video synthesis tasks.', 'score': 1, 'issue_id': 1235, 'pub_date': '2024-12-17', 'pub_date_card': {'ru': '17 декабря', 'en': 'December 17', 'zh': '12月17日'}, 'hash': '0550a2936389fd19', 'authors': ['Hsin-Ping Huang', 'Yang Zhou', 'Jui-Hsien Wang', 'Difan Liu', 'Feng Liu', 'Ming-Hsuan Yang', 'Zhan Xu'], 'affiliations': ['Adobe Research', 'University of California, Merced'], 'pdf_title_img': 'assets/pdf/title_img/2412.13185.jpg', 'data': {'categories': ['#video', '#games', '#multimodal', '#diffusion', '#dataset'], 'emoji': '🎥', 'ru': {'title': 'Генерация реалистичных движений человека на основе изображения сцены', 'desc': 'Статья представляет новый подход к генерации реалистичных видео с людьми под названием Move-in-2D. Метод использует диффузионную модель, которая принимает изображение сцены и текстовый запрос в качестве входных данных для создания последовательности движений человека, адаптированной к сцене. Для обучения модели был собран масштабный набор данных видео с одиночными человеческими действиями, где каждое видео аннотировано соответствующим движением человека. Эксперименты показывают, что метод эффективно предсказывает движения человека, соответствующие изображению сцены, и улучшает качество движений в задачах синтеза видео.'}, 'en': {'title': 'Dynamic Motion Generation Tailored to Any Scene', 'desc': 'This paper introduces Move-in-2D, a new method for generating realistic human motion sequences based on scene images. Unlike previous methods that rely on pre-existing motion data, our approach allows for a wider variety of motions that can adapt to different environments. We utilize a diffusion model that takes both a scene image and a text prompt to create customized motion sequences. 
Our experiments show that this method not only aligns the generated motion with the scene but also enhances the overall quality of human motion in video synthesis tasks.'}, 'zh': {'title': '根据场景生成多样化人类运动序列', 'desc': '生成逼真的人类视频仍然是一个具有挑战性的任务,目前最有效的方法通常依赖于人类运动序列作为控制信号。现有的方法通常使用从其他视频中提取的运动,这限制了应用于特定运动类型和全局场景匹配。我们提出了Move-in-2D,这是一种新颖的方法,可以根据场景图像生成适应不同场景的人类运动序列。我们的模型利用扩散模型,接受场景图像和文本提示作为输入,生成与场景相匹配的运动序列。'}}}];
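// --- DOM references and UI state for the article list ---
// articlesContainer receives the rendered paper cards; the remaining elements
// drive sorting, category filtering (with AND/OR logic) and the clear-filters control.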
const articlesContainer = document.getElementById('articles-container');
const sortDropdown = document.getElementById('sort-dropdown');
const categoryFiltersContainer = document.getElementById('category-filters');
const categoryFiltersLogicOptions = document.getElementById('category-options');
const categoryToggle = document.getElementById('category-toggle');
const clearCategoriesButton = document.getElementById('clear-categories');
let selectedCategories = [];
let selectedArticles = [];
let sortBy = 'issue_id';
let showLimitHint = false;
let filterLogicIsAnd = false;
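// Read the selected categories from the "cat" query parameter and restore the
// leading '#' that updateUrlWithCategories strips when writing the URL.
// Example: /?cat=cv,diffusion -> ['#cv', '#diffusion'].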
function getUrlParameters() {
const urlParams = new URLSearchParams(window.location.search);
const categoriesParam = urlParams.get('cat');
let categories = categoriesParam ? categoriesParam.split(',') : [];
categories = categories.map(element => `#${element}`);
return categories;
}
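// Inverse of getUrlParameters: strip the leading '#' from each selected category
// and write the result into the URL as ?cat=..., or drop the parameter entirely
// when nothing is selected. history.pushState keeps the page from reloading.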
function updateUrlWithCategories() {
let cleanedCategories = selectedCategories.map(element => element.replace(/^#/, ''));
const newUrl = cleanedCategories.length > 0
? `${window.location.pathname}?cat=${cleanedCategories.join(',')}`
: window.location.pathname;
window.history.pushState({}, '', newUrl);
}
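// Example round trip (hypothetical values, assuming the page is served at /index.html):
//   selectedCategories = ['#cv', '#diffusion'];
//   updateUrlWithCategories();   // URL becomes /index.html?cat=cv,diffusion
//   getUrlParameters();          // -> ['#cv', '#diffusion']

// Restore persisted UI settings from localStorage: dark/light theme, sort order and
// the AND/OR filter logic. localStorage stores strings only, which is why the flags
// are compared against 'true' and a stale 'null' sort value falls back to 'issue_id'.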
function loadSettings() {
const themeToggle = document.getElementById('theme-toggle');
const sortDropdown = document.getElementById('sort-dropdown');
const isDarkMode = localStorage.getItem('darkMode') === 'true';
let settingSortBy = localStorage.getItem('sort_by');
filterLogicIsAnd = localStorage.getItem('filter_logic_is_and') === 'true';
if (isDarkMode) {
document.body.classList.remove('light-theme');
document.body.classList.add('dark-theme');
themeToggle.checked = true;
const title = document.getElementById('doomgrad');
title.innerHTML = "hf nightly";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.add('rotate');
}
if ((!settingSortBy) || (settingSortBy === 'null')) {
settingSortBy = 'issue_id';
}
if (filterLogicIsAnd) {
document.getElementById('filter-logic-and').checked = true;
} else {
document.getElementById('filter-logic-or').checked = true;
}
sortDropdown.value = settingSortBy;
sortBy = settingSortBy;
}
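// Wire up the theme switch and the AND/OR filter-logic radio buttons; changing the
// logic persists the choice and immediately re-filters and re-renders the list.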
document.getElementById('theme-toggle').addEventListener('change', toggleTheme);
document.getElementById('filter-logic-and').addEventListener('change', () => {
filterLogicIsAnd = true;
localStorage.setItem('filter_logic_is_and', 'true');
filterAndRenderArticles();
updateSelectedArticlesTitle();
});
document.getElementById('filter-logic-or').addEventListener('change', () => {
filterLogicIsAnd = false;
localStorage.setItem('filter_logic_is_and', 'false');
filterAndRenderArticles();
updateSelectedArticlesTitle();
});
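// Collect the distinct category tags found in the loaded articles, alphabetically
// sorted. Kept as a utility even though createCategoryButtons below currently
// works from a hardcoded list.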
function getUniqueCategories(articles) {
const categories = new Set();
articles.forEach(article => {
if (article.data && article.data.categories) {
article.data.categories.forEach(cat => categories.add(cat));
}
});
let res = Array.from(categories);
res.sort();
return res;
}
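// Build the clickable category filter buttons. The list is hardcoded with per-issue
// counts (e.g. '#cv (5)'); tags without a count have no matching articles in this
// issue and are rendered inactive. Only the tag itself (the part before the space)
// is used for filtering.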
function createCategoryButtons() {
//const categories = getUniqueCategories(articlesData);
const categories = ['#3d (2)', '#agents', '#agi (1)', '#alignment (1)', '#architecture (4)', '#audio (1)', '#benchmark (8)', '#cv (5)', '#data (3)', '#dataset (10)', '#diffusion (7)', '#ethics', '#games (2)', '#graphs', '#hallucinations', '#healthcare', '#inference (1)', '#interpretability (1)', '#leakage', '#long_context (1)', '#low_resource', '#machine_translation', '#math (1)', '#multilingual', '#multimodal (8)', '#open_source (5)', '#optimization (5)', '#plp', '#rag', '#reasoning (5)', '#rl', '#rlhf', '#robotics', '#science (1)', '#security', '#small_models', '#story_generation', '#survey', '#synthetic (4)', '#training (5)', '#transfer_learning', '#video (3)'];
categories.forEach(category => {
let catNameSplitted = category.split(/(\s+)/);
let catName = catNameSplitted[0];
const button = document.createElement('span');
button.textContent = catName;
button.className = 'category-button';
if (catNameSplitted.length < 2) {
button.classList.add('inactive');
}
button.onclick = () => toggleCategory(catName, button);
categoryFiltersContainer.appendChild(button);
});
}
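// Toggle a category in selectedCategories, update the button's active state,
// re-filter the article list and persist the selection.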
function toggleCategory(category, button) {
const index = selectedCategories.indexOf(category);
if (index === -1) {
selectedCategories.push(category);
button.classList.add('active');
} else {
selectedCategories.splice(index, 1);
button.classList.remove('active');
}
filterAndRenderArticles();
saveCategorySelection();