| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 45.0, |
| "eval_steps": 500, |
| "global_step": 158130, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.14228799089356858, |
| "grad_norm": 0.6334199905395508, |
| "learning_rate": 0.0001996, |
| "loss": 7.48322509765625, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.28457598178713717, |
| "grad_norm": 1.0710968971252441, |
| "learning_rate": 0.0003996, |
| "loss": 6.2563916015625, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.42686397268070575, |
| "grad_norm": 0.6296343207359314, |
| "learning_rate": 0.0005996, |
| "loss": 6.11807275390625, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5691519635742743, |
| "grad_norm": 0.9468093514442444, |
| "learning_rate": 0.0007996, |
| "loss": 6.00693505859375, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7114399544678429, |
| "grad_norm": 1.3960775136947632, |
| "learning_rate": 0.0009996, |
| "loss": 5.5765498046875, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.8537279453614115, |
| "grad_norm": 1.1396394968032837, |
| "learning_rate": 0.0009967936773115724, |
| "loss": 4.27981396484375, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.9960159362549801, |
| "grad_norm": 1.1924173831939697, |
| "learning_rate": 0.000993580929126775, |
| "loss": 3.510773193359375, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.1383039271485487, |
| "grad_norm": 1.1864285469055176, |
| "learning_rate": 0.0009903681809419777, |
| "loss": 3.184400390625, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.2805919180421172, |
| "grad_norm": 0.9659832119941711, |
| "learning_rate": 0.0009871554327571805, |
| "loss": 2.9972861328125, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.4228799089356858, |
| "grad_norm": 1.0554136037826538, |
| "learning_rate": 0.0009839426845723832, |
| "loss": 2.87672265625, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.5651678998292544, |
| "grad_norm": 0.989261269569397, |
| "learning_rate": 0.000980729936387586, |
| "loss": 2.784982177734375, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.707455890722823, |
| "grad_norm": 1.0604394674301147, |
| "learning_rate": 0.0009775171882027887, |
| "loss": 2.708709716796875, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.8497438816163916, |
| "grad_norm": 0.9650368094444275, |
| "learning_rate": 0.0009743044400179915, |
| "loss": 2.65090625, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9920318725099602, |
| "grad_norm": 0.9355012774467468, |
| "learning_rate": 0.0009710916918331941, |
| "loss": 2.607532470703125, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.1343198634035287, |
| "grad_norm": 0.962996780872345, |
| "learning_rate": 0.0009678789436483969, |
| "loss": 2.54883447265625, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.2766078542970973, |
| "grad_norm": 1.0536885261535645, |
| "learning_rate": 0.0009646661954635996, |
| "loss": 2.514782958984375, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.418895845190666, |
| "grad_norm": 1.0663453340530396, |
| "learning_rate": 0.0009614534472788023, |
| "loss": 2.487317138671875, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.5611838360842345, |
| "grad_norm": 0.9626434445381165, |
| "learning_rate": 0.000958240699094005, |
| "loss": 2.468934814453125, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.703471826977803, |
| "grad_norm": 0.9301921129226685, |
| "learning_rate": 0.0009550279509092078, |
| "loss": 2.447890380859375, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.8457598178713717, |
| "grad_norm": 1.0070905685424805, |
| "learning_rate": 0.0009518152027244105, |
| "loss": 2.418142333984375, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9880478087649402, |
| "grad_norm": 0.9236523509025574, |
| "learning_rate": 0.0009486024545396133, |
| "loss": 2.40807373046875, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.130335799658509, |
| "grad_norm": 0.9421653747558594, |
| "learning_rate": 0.0009453897063548159, |
| "loss": 2.368919677734375, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2726237905520774, |
| "grad_norm": 0.9248610138893127, |
| "learning_rate": 0.0009421769581700187, |
| "loss": 2.36261572265625, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.414911781445646, |
| "grad_norm": 0.9469349384307861, |
| "learning_rate": 0.0009389642099852214, |
| "loss": 2.3440390625, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.5571997723392146, |
| "grad_norm": 1.0009181499481201, |
| "learning_rate": 0.0009357514618004241, |
| "loss": 2.32732666015625, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.699487763232783, |
| "grad_norm": 0.9607020020484924, |
| "learning_rate": 0.0009325387136156268, |
| "loss": 2.313716552734375, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.8417757541263517, |
| "grad_norm": 0.9231477975845337, |
| "learning_rate": 0.0009293259654308296, |
| "loss": 2.30414453125, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.9840637450199203, |
| "grad_norm": 0.9701341986656189, |
| "learning_rate": 0.0009261132172460323, |
| "loss": 2.2853681640625, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.126351735913489, |
| "grad_norm": 0.9349779486656189, |
| "learning_rate": 0.000922900469061235, |
| "loss": 2.26698486328125, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.2686397268070575, |
| "grad_norm": 0.9157093167304993, |
| "learning_rate": 0.0009196877208764377, |
| "loss": 2.256426513671875, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.410927717700626, |
| "grad_norm": 0.9236478209495544, |
| "learning_rate": 0.0009164749726916405, |
| "loss": 2.25071337890625, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.553215708594195, |
| "grad_norm": 0.9309902191162109, |
| "learning_rate": 0.0009132622245068431, |
| "loss": 2.2407861328125, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.695503699487763, |
| "grad_norm": 0.9968065619468689, |
| "learning_rate": 0.000910049476322046, |
| "loss": 2.232171142578125, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.837791690381332, |
| "grad_norm": 0.893164336681366, |
| "learning_rate": 0.0009068367281372486, |
| "loss": 2.223586669921875, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.9800796812749, |
| "grad_norm": 0.9570403695106506, |
| "learning_rate": 0.0009036239799524514, |
| "loss": 2.2169453125, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.122367672168469, |
| "grad_norm": 0.9152570962905884, |
| "learning_rate": 0.0009004112317676541, |
| "loss": 2.1965634765625, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.264655663062038, |
| "grad_norm": 0.9538166522979736, |
| "learning_rate": 0.0008971984835828568, |
| "loss": 2.18993994140625, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.406943653955606, |
| "grad_norm": 0.8849217891693115, |
| "learning_rate": 0.0008939857353980595, |
| "loss": 2.179218994140625, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.549231644849175, |
| "grad_norm": 0.9768293499946594, |
| "learning_rate": 0.0008907729872132623, |
| "loss": 2.174207763671875, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.691519635742743, |
| "grad_norm": 0.9277469515800476, |
| "learning_rate": 0.0008875602390284649, |
| "loss": 2.1700234375, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.833807626636312, |
| "grad_norm": 0.9375103116035461, |
| "learning_rate": 0.0008843474908436677, |
| "loss": 2.166572998046875, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.9760956175298805, |
| "grad_norm": 0.9594961404800415, |
| "learning_rate": 0.0008811347426588704, |
| "loss": 2.15810888671875, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.118383608423449, |
| "grad_norm": 0.8761999607086182, |
| "learning_rate": 0.0008779219944740731, |
| "loss": 2.13630029296875, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.260671599317018, |
| "grad_norm": 0.8995809555053711, |
| "learning_rate": 0.0008747092462892759, |
| "loss": 2.13485693359375, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.402959590210586, |
| "grad_norm": 0.9557548761367798, |
| "learning_rate": 0.0008714964981044786, |
| "loss": 2.12809130859375, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.545247581104155, |
| "grad_norm": 0.8720033168792725, |
| "learning_rate": 0.0008682837499196813, |
| "loss": 2.12343359375, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.687535571997723, |
| "grad_norm": 0.9293733239173889, |
| "learning_rate": 0.0008650710017348841, |
| "loss": 2.121170166015625, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.829823562891292, |
| "grad_norm": 0.9301312565803528, |
| "learning_rate": 0.0008618582535500867, |
| "loss": 2.115037841796875, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.972111553784861, |
| "grad_norm": 0.9322926998138428, |
| "learning_rate": 0.0008586455053652895, |
| "loss": 2.11170068359375, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.114399544678429, |
| "grad_norm": 0.9266783595085144, |
| "learning_rate": 0.0008554327571804922, |
| "loss": 2.0983544921875, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.256687535571998, |
| "grad_norm": 0.9406460523605347, |
| "learning_rate": 0.0008522200089956949, |
| "loss": 2.08985888671875, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.398975526465566, |
| "grad_norm": 0.923007607460022, |
| "learning_rate": 0.0008490072608108977, |
| "loss": 2.093153564453125, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.541263517359135, |
| "grad_norm": 0.9110872149467468, |
| "learning_rate": 0.0008457945126261004, |
| "loss": 2.080158447265625, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.6835515082527035, |
| "grad_norm": 0.9028299450874329, |
| "learning_rate": 0.000842581764441303, |
| "loss": 2.0810068359375, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.825839499146272, |
| "grad_norm": 0.9071224927902222, |
| "learning_rate": 0.0008393690162565059, |
| "loss": 2.08135595703125, |
| "step": 27500 |
| }, |
| { |
| "epoch": 7.968127490039841, |
| "grad_norm": 0.8702104687690735, |
| "learning_rate": 0.0008361562680717085, |
| "loss": 2.073836181640625, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.110415480933408, |
| "grad_norm": 0.9875132441520691, |
| "learning_rate": 0.0008329435198869114, |
| "loss": 2.06207470703125, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.252703471826978, |
| "grad_norm": 0.8733546733856201, |
| "learning_rate": 0.000829730771702114, |
| "loss": 2.0590986328125, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.394991462720546, |
| "grad_norm": 0.8623999357223511, |
| "learning_rate": 0.0008265180235173167, |
| "loss": 2.05387744140625, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.537279453614115, |
| "grad_norm": 0.9093107581138611, |
| "learning_rate": 0.0008233052753325195, |
| "loss": 2.049600830078125, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.679567444507683, |
| "grad_norm": 0.8935321569442749, |
| "learning_rate": 0.0008200925271477222, |
| "loss": 2.04915966796875, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.821855435401252, |
| "grad_norm": 0.9517919421195984, |
| "learning_rate": 0.0008168797789629248, |
| "loss": 2.044854248046875, |
| "step": 31000 |
| }, |
| { |
| "epoch": 8.964143426294822, |
| "grad_norm": 0.924595057964325, |
| "learning_rate": 0.0008136670307781277, |
| "loss": 2.0417685546875, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.10643141718839, |
| "grad_norm": 0.8856834173202515, |
| "learning_rate": 0.0008104542825933303, |
| "loss": 2.0327542724609375, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.248719408081957, |
| "grad_norm": 0.851649820804596, |
| "learning_rate": 0.000807241534408533, |
| "loss": 2.026624267578125, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.391007398975526, |
| "grad_norm": 0.8763725161552429, |
| "learning_rate": 0.0008040287862237358, |
| "loss": 2.023781982421875, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.533295389869096, |
| "grad_norm": 0.9054471850395203, |
| "learning_rate": 0.0008008160380389385, |
| "loss": 2.0217911376953124, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.675583380762664, |
| "grad_norm": 0.9936219453811646, |
| "learning_rate": 0.0007976032898541413, |
| "loss": 2.0223516845703124, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.817871371656231, |
| "grad_norm": 0.8445395231246948, |
| "learning_rate": 0.000794390541669344, |
| "loss": 2.0150150146484376, |
| "step": 34500 |
| }, |
| { |
| "epoch": 9.9601593625498, |
| "grad_norm": 0.8997848629951477, |
| "learning_rate": 0.0007911777934845466, |
| "loss": 2.0144486083984376, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.102447353443369, |
| "grad_norm": 0.9022369384765625, |
| "learning_rate": 0.0007879650452997495, |
| "loss": 2.0057178955078125, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.244735344336938, |
| "grad_norm": 0.8679398894309998, |
| "learning_rate": 0.0007847522971149521, |
| "loss": 1.996031005859375, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.387023335230506, |
| "grad_norm": 0.8584362864494324, |
| "learning_rate": 0.0007815395489301548, |
| "loss": 1.9979686279296875, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.529311326124075, |
| "grad_norm": 0.919893741607666, |
| "learning_rate": 0.0007783268007453576, |
| "loss": 1.9953302001953126, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.671599317017645, |
| "grad_norm": 0.8829244375228882, |
| "learning_rate": 0.0007751140525605603, |
| "loss": 1.9904837646484375, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.813887307911212, |
| "grad_norm": 0.8700593113899231, |
| "learning_rate": 0.000771901304375763, |
| "loss": 1.9878653564453126, |
| "step": 38000 |
| }, |
| { |
| "epoch": 10.95617529880478, |
| "grad_norm": 0.8948189616203308, |
| "learning_rate": 0.0007686885561909658, |
| "loss": 1.9861346435546876, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.09846328969835, |
| "grad_norm": 0.8743970990180969, |
| "learning_rate": 0.0007654758080061684, |
| "loss": 1.9781466064453126, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.240751280591917, |
| "grad_norm": 0.8927684426307678, |
| "learning_rate": 0.0007622630598213713, |
| "loss": 1.9800474853515626, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.383039271485487, |
| "grad_norm": 0.8602464199066162, |
| "learning_rate": 0.0007590503116365739, |
| "loss": 1.969161865234375, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.525327262379054, |
| "grad_norm": 0.9330834746360779, |
| "learning_rate": 0.0007558375634517766, |
| "loss": 1.9744647216796876, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.667615253272624, |
| "grad_norm": 0.8487387895584106, |
| "learning_rate": 0.0007526248152669794, |
| "loss": 1.9631361083984376, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.809903244166193, |
| "grad_norm": 0.8121556639671326, |
| "learning_rate": 0.0007494120670821821, |
| "loss": 1.9670977783203125, |
| "step": 41500 |
| }, |
| { |
| "epoch": 11.952191235059761, |
| "grad_norm": 0.8926526308059692, |
| "learning_rate": 0.0007461993188973848, |
| "loss": 1.962933349609375, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.094479225953329, |
| "grad_norm": 0.8671021461486816, |
| "learning_rate": 0.0007429865707125876, |
| "loss": 1.9563138427734375, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.236767216846898, |
| "grad_norm": 0.8316618800163269, |
| "learning_rate": 0.0007397738225277902, |
| "loss": 1.9496668701171875, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.379055207740466, |
| "grad_norm": 0.8220306038856506, |
| "learning_rate": 0.000736561074342993, |
| "loss": 1.9505400390625, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.521343198634035, |
| "grad_norm": 0.882720410823822, |
| "learning_rate": 0.0007333483261581957, |
| "loss": 1.9462012939453126, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.663631189527603, |
| "grad_norm": 0.8841201663017273, |
| "learning_rate": 0.0007301355779733984, |
| "loss": 1.94572216796875, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.805919180421172, |
| "grad_norm": 0.8917742967605591, |
| "learning_rate": 0.0007269228297886012, |
| "loss": 1.9462489013671875, |
| "step": 45000 |
| }, |
| { |
| "epoch": 12.948207171314742, |
| "grad_norm": 0.8612926006317139, |
| "learning_rate": 0.0007237100816038039, |
| "loss": 1.9503948974609375, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.09049516220831, |
| "grad_norm": 0.9925726652145386, |
| "learning_rate": 0.0007204973334190066, |
| "loss": 1.9372962646484375, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.232783153101877, |
| "grad_norm": 0.8212889432907104, |
| "learning_rate": 0.0007172845852342094, |
| "loss": 1.93100146484375, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.375071143995447, |
| "grad_norm": 0.8393476605415344, |
| "learning_rate": 0.000714071837049412, |
| "loss": 1.9355791015625, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.517359134889016, |
| "grad_norm": 0.876530647277832, |
| "learning_rate": 0.0007108590888646148, |
| "loss": 1.929031982421875, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.659647125782584, |
| "grad_norm": 0.9148900508880615, |
| "learning_rate": 0.0007076463406798175, |
| "loss": 1.9321796875, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.801935116676152, |
| "grad_norm": 0.8540393114089966, |
| "learning_rate": 0.0007044335924950202, |
| "loss": 1.9232811279296875, |
| "step": 48500 |
| }, |
| { |
| "epoch": 13.944223107569721, |
| "grad_norm": 0.874427080154419, |
| "learning_rate": 0.0007012208443102229, |
| "loss": 1.920580078125, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.086511098463289, |
| "grad_norm": 0.9040400385856628, |
| "learning_rate": 0.0006980080961254257, |
| "loss": 1.912923095703125, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.228799089356858, |
| "grad_norm": 0.9279779195785522, |
| "learning_rate": 0.0006947953479406284, |
| "loss": 1.9119281005859374, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.371087080250426, |
| "grad_norm": 0.8857927322387695, |
| "learning_rate": 0.0006915825997558312, |
| "loss": 1.907870361328125, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.513375071143995, |
| "grad_norm": 0.8245786428451538, |
| "learning_rate": 0.0006883698515710338, |
| "loss": 1.9115518798828126, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.655663062037565, |
| "grad_norm": 0.8284105658531189, |
| "learning_rate": 0.0006851571033862366, |
| "loss": 1.91026611328125, |
| "step": 51500 |
| }, |
| { |
| "epoch": 14.797951052931133, |
| "grad_norm": 0.8700644969940186, |
| "learning_rate": 0.0006819443552014393, |
| "loss": 1.9089705810546875, |
| "step": 52000 |
| }, |
| { |
| "epoch": 14.9402390438247, |
| "grad_norm": 0.8545106649398804, |
| "learning_rate": 0.000678731607016642, |
| "loss": 1.9038966064453124, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.08252703471827, |
| "grad_norm": 0.8972774744033813, |
| "learning_rate": 0.0006755188588318448, |
| "loss": 1.9033292236328125, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.224815025611838, |
| "grad_norm": 0.8351185321807861, |
| "learning_rate": 0.0006723061106470475, |
| "loss": 1.8937845458984375, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.367103016505407, |
| "grad_norm": 0.8219364285469055, |
| "learning_rate": 0.0006690933624622503, |
| "loss": 1.8965240478515626, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.509391007398975, |
| "grad_norm": 0.8361454606056213, |
| "learning_rate": 0.0006658806142774529, |
| "loss": 1.89189599609375, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.651678998292544, |
| "grad_norm": 0.8974409699440002, |
| "learning_rate": 0.0006626678660926557, |
| "loss": 1.89478369140625, |
| "step": 55000 |
| }, |
| { |
| "epoch": 15.793966989186112, |
| "grad_norm": 0.880893886089325, |
| "learning_rate": 0.0006594551179078584, |
| "loss": 1.8854979248046875, |
| "step": 55500 |
| }, |
| { |
| "epoch": 15.936254980079681, |
| "grad_norm": 0.8354572057723999, |
| "learning_rate": 0.0006562423697230612, |
| "loss": 1.884849853515625, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.07854297097325, |
| "grad_norm": 0.8348339200019836, |
| "learning_rate": 0.0006530296215382638, |
| "loss": 1.8829962158203124, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.220830961866817, |
| "grad_norm": 0.828911304473877, |
| "learning_rate": 0.0006498168733534666, |
| "loss": 1.8815081787109376, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.363118952760388, |
| "grad_norm": 0.8096295595169067, |
| "learning_rate": 0.0006466041251686693, |
| "loss": 1.87695361328125, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.505406943653956, |
| "grad_norm": 0.8548514246940613, |
| "learning_rate": 0.0006433913769838721, |
| "loss": 1.8754146728515626, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.647694934547523, |
| "grad_norm": 0.8417186737060547, |
| "learning_rate": 0.0006401786287990747, |
| "loss": 1.8718682861328124, |
| "step": 58500 |
| }, |
| { |
| "epoch": 16.78998292544109, |
| "grad_norm": 0.8816096186637878, |
| "learning_rate": 0.0006369658806142775, |
| "loss": 1.876389404296875, |
| "step": 59000 |
| }, |
| { |
| "epoch": 16.932270916334662, |
| "grad_norm": 0.8592162728309631, |
| "learning_rate": 0.0006337531324294802, |
| "loss": 1.8765806884765626, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.07455890722823, |
| "grad_norm": 0.8253895044326782, |
| "learning_rate": 0.0006305403842446829, |
| "loss": 1.862882568359375, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.216846898121798, |
| "grad_norm": 0.848976731300354, |
| "learning_rate": 0.0006273276360598856, |
| "loss": 1.860851806640625, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.359134889015365, |
| "grad_norm": 0.8160614967346191, |
| "learning_rate": 0.0006241148878750884, |
| "loss": 1.8602349853515625, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.501422879908937, |
| "grad_norm": 0.8240634799003601, |
| "learning_rate": 0.0006209021396902911, |
| "loss": 1.855229736328125, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.643710870802504, |
| "grad_norm": 0.82338947057724, |
| "learning_rate": 0.0006176893915054939, |
| "loss": 1.8570489501953125, |
| "step": 62000 |
| }, |
| { |
| "epoch": 17.785998861696072, |
| "grad_norm": 0.7808911204338074, |
| "learning_rate": 0.0006144766433206965, |
| "loss": 1.8573056640625, |
| "step": 62500 |
| }, |
| { |
| "epoch": 17.92828685258964, |
| "grad_norm": 0.8358622789382935, |
| "learning_rate": 0.0006112638951358993, |
| "loss": 1.860720703125, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.07057484348321, |
| "grad_norm": 0.8733948469161987, |
| "learning_rate": 0.000608051146951102, |
| "loss": 1.8501636962890624, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.21286283437678, |
| "grad_norm": 0.8293560743331909, |
| "learning_rate": 0.0006048383987663047, |
| "loss": 1.8436048583984375, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.355150825270346, |
| "grad_norm": 0.8083025217056274, |
| "learning_rate": 0.0006016256505815074, |
| "loss": 1.845773681640625, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.497438816163914, |
| "grad_norm": 0.8062528967857361, |
| "learning_rate": 0.0005984129023967102, |
| "loss": 1.849447998046875, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.639726807057485, |
| "grad_norm": 0.797907829284668, |
| "learning_rate": 0.0005952001542119128, |
| "loss": 1.8439942626953125, |
| "step": 65500 |
| }, |
| { |
| "epoch": 18.782014797951053, |
| "grad_norm": 0.8070884346961975, |
| "learning_rate": 0.0005919874060271157, |
| "loss": 1.8405416259765626, |
| "step": 66000 |
| }, |
| { |
| "epoch": 18.92430278884462, |
| "grad_norm": 0.8484400510787964, |
| "learning_rate": 0.0005887746578423183, |
| "loss": 1.8404969482421876, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.06659077973819, |
| "grad_norm": 0.8148825168609619, |
| "learning_rate": 0.0005855619096575211, |
| "loss": 1.8372745361328124, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.20887877063176, |
| "grad_norm": 0.7816134691238403, |
| "learning_rate": 0.0005823491614727238, |
| "loss": 1.8346165771484375, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.351166761525327, |
| "grad_norm": 0.8447558283805847, |
| "learning_rate": 0.0005791364132879265, |
| "loss": 1.8296884765625, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.493454752418895, |
| "grad_norm": 0.8260893225669861, |
| "learning_rate": 0.0005759236651031293, |
| "loss": 1.830061279296875, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.635742743312463, |
| "grad_norm": 0.7893286347389221, |
| "learning_rate": 0.000572710916918332, |
| "loss": 1.829275634765625, |
| "step": 69000 |
| }, |
| { |
| "epoch": 19.778030734206034, |
| "grad_norm": 0.8122330904006958, |
| "learning_rate": 0.0005694981687335346, |
| "loss": 1.829096923828125, |
| "step": 69500 |
| }, |
| { |
| "epoch": 19.9203187250996, |
| "grad_norm": 0.825334370136261, |
| "learning_rate": 0.0005662854205487375, |
| "loss": 1.82976708984375, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.06260671599317, |
| "grad_norm": 0.8224254846572876, |
| "learning_rate": 0.0005630726723639401, |
| "loss": 1.8201314697265625, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.204894706886737, |
| "grad_norm": 0.8266887068748474, |
| "learning_rate": 0.0005598599241791428, |
| "loss": 1.821681884765625, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.34718269778031, |
| "grad_norm": 0.8528222441673279, |
| "learning_rate": 0.0005566471759943456, |
| "loss": 1.813265869140625, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.489470688673876, |
| "grad_norm": 0.8061295747756958, |
| "learning_rate": 0.0005534344278095483, |
| "loss": 1.81572412109375, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.631758679567444, |
| "grad_norm": 0.8042652010917664, |
| "learning_rate": 0.000550221679624751, |
| "loss": 1.8168936767578125, |
| "step": 72500 |
| }, |
| { |
| "epoch": 20.77404667046101, |
| "grad_norm": 0.7869358658790588, |
| "learning_rate": 0.0005470089314399538, |
| "loss": 1.814272705078125, |
| "step": 73000 |
| }, |
| { |
| "epoch": 20.916334661354583, |
| "grad_norm": 0.8155378699302673, |
| "learning_rate": 0.0005437961832551564, |
| "loss": 1.81436279296875, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.05862265224815, |
| "grad_norm": 0.7901885509490967, |
| "learning_rate": 0.0005405834350703593, |
| "loss": 1.811807861328125, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.200910643141718, |
| "grad_norm": 0.7788444757461548, |
| "learning_rate": 0.0005373706868855619, |
| "loss": 1.8064168701171874, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.343198634035286, |
| "grad_norm": 0.8150326609611511, |
| "learning_rate": 0.0005341579387007646, |
| "loss": 1.7998411865234376, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.485486624928857, |
| "grad_norm": 0.790658175945282, |
| "learning_rate": 0.0005309451905159674, |
| "loss": 1.8009261474609375, |
| "step": 75500 |
| }, |
| { |
| "epoch": 21.627774615822425, |
| "grad_norm": 0.8362455368041992, |
| "learning_rate": 0.0005277324423311701, |
| "loss": 1.7981884765625, |
| "step": 76000 |
| }, |
| { |
| "epoch": 21.770062606715992, |
| "grad_norm": 0.8072263598442078, |
| "learning_rate": 0.0005245196941463727, |
| "loss": 1.802354248046875, |
| "step": 76500 |
| }, |
| { |
| "epoch": 21.91235059760956, |
| "grad_norm": 0.8608630895614624, |
| "learning_rate": 0.0005213069459615756, |
| "loss": 1.7926978759765626, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.05463858850313, |
| "grad_norm": 0.8350149989128113, |
| "learning_rate": 0.0005180941977767782, |
| "loss": 1.7972198486328126, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.1969265793967, |
| "grad_norm": 0.8605798482894897, |
| "learning_rate": 0.0005148814495919811, |
| "loss": 1.7945047607421876, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.339214570290267, |
| "grad_norm": 0.8054996728897095, |
| "learning_rate": 0.0005116687014071837, |
| "loss": 1.7889041748046874, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.481502561183834, |
| "grad_norm": 0.8018432855606079, |
| "learning_rate": 0.0005084559532223864, |
| "loss": 1.7955711669921874, |
| "step": 79000 |
| }, |
| { |
| "epoch": 22.623790552077406, |
| "grad_norm": 0.8167839050292969, |
| "learning_rate": 0.0005052432050375892, |
| "loss": 1.789824462890625, |
| "step": 79500 |
| }, |
| { |
| "epoch": 22.766078542970973, |
| "grad_norm": 0.8051914572715759, |
| "learning_rate": 0.0005020304568527919, |
| "loss": 1.786076904296875, |
| "step": 80000 |
| }, |
| { |
| "epoch": 22.90836653386454, |
| "grad_norm": 0.8690944314002991, |
| "learning_rate": 0.0004988177086679945, |
| "loss": 1.787760009765625, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.05065452475811, |
| "grad_norm": 0.7857160568237305, |
| "learning_rate": 0.0004956049604831974, |
| "loss": 1.78182080078125, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.19294251565168, |
| "grad_norm": 0.792676568031311, |
| "learning_rate": 0.0004923922122984001, |
| "loss": 1.7775865478515624, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.335230506545248, |
| "grad_norm": 0.8044800162315369, |
| "learning_rate": 0.0004891794641136028, |
| "loss": 1.776404052734375, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.477518497438815, |
| "grad_norm": 0.8092531561851501, |
| "learning_rate": 0.00048596671592880555, |
| "loss": 1.7750899658203125, |
| "step": 82500 |
| }, |
| { |
| "epoch": 23.619806488332383, |
| "grad_norm": 0.8156195878982544, |
| "learning_rate": 0.00048275396774400824, |
| "loss": 1.7721761474609374, |
| "step": 83000 |
| }, |
| { |
| "epoch": 23.762094479225954, |
| "grad_norm": 0.8054773211479187, |
| "learning_rate": 0.000479541219559211, |
| "loss": 1.7737218017578125, |
| "step": 83500 |
| }, |
| { |
| "epoch": 23.904382470119522, |
| "grad_norm": 0.8544410467147827, |
| "learning_rate": 0.0004763284713744137, |
| "loss": 1.7725914306640624, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.04667046101309, |
| "grad_norm": 0.8268908262252808, |
| "learning_rate": 0.00047311572318961645, |
| "loss": 1.7661802978515626, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.188958451906657, |
| "grad_norm": 0.8119874000549316, |
| "learning_rate": 0.00046990297500481914, |
| "loss": 1.7655081787109375, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.33124644280023, |
| "grad_norm": 0.819488525390625, |
| "learning_rate": 0.0004666902268200219, |
| "loss": 1.76497509765625, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.473534433693796, |
| "grad_norm": 0.7789687514305115, |
| "learning_rate": 0.0004634774786352246, |
| "loss": 1.7610302734375, |
| "step": 86000 |
| }, |
| { |
| "epoch": 24.615822424587364, |
| "grad_norm": 0.7585675716400146, |
| "learning_rate": 0.0004602647304504273, |
| "loss": 1.7648392333984375, |
| "step": 86500 |
| }, |
| { |
| "epoch": 24.75811041548093, |
| "grad_norm": 0.7789969444274902, |
| "learning_rate": 0.00045705198226563004, |
| "loss": 1.7639283447265626, |
| "step": 87000 |
| }, |
| { |
| "epoch": 24.900398406374503, |
| "grad_norm": 0.802516758441925, |
| "learning_rate": 0.0004538392340808328, |
| "loss": 1.7633868408203126, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.04268639726807, |
| "grad_norm": 0.8010969161987305, |
| "learning_rate": 0.0004506264858960355, |
| "loss": 1.7580526123046876, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.18497438816164, |
| "grad_norm": 0.8012429475784302, |
| "learning_rate": 0.0004474137377112382, |
| "loss": 1.7505838623046874, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.327262379055206, |
| "grad_norm": 0.8224115967750549, |
| "learning_rate": 0.00044420098952644094, |
| "loss": 1.75313427734375, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.469550369948777, |
| "grad_norm": 0.8361182808876038, |
| "learning_rate": 0.0004409882413416437, |
| "loss": 1.75422998046875, |
| "step": 89500 |
| }, |
| { |
| "epoch": 25.611838360842345, |
| "grad_norm": 0.8170642852783203, |
| "learning_rate": 0.0004377754931568464, |
| "loss": 1.7506724853515625, |
| "step": 90000 |
| }, |
| { |
| "epoch": 25.754126351735913, |
| "grad_norm": 0.7613300085067749, |
| "learning_rate": 0.0004345627449720491, |
| "loss": 1.7525675048828124, |
| "step": 90500 |
| }, |
| { |
| "epoch": 25.89641434262948, |
| "grad_norm": 0.8419134616851807, |
| "learning_rate": 0.00043134999678725184, |
| "loss": 1.75163330078125, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.03870233352305, |
| "grad_norm": 0.7830471992492676, |
| "learning_rate": 0.0004281372486024546, |
| "loss": 1.7453779296875, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.18099032441662, |
| "grad_norm": 0.7521931529045105, |
| "learning_rate": 0.00042492450041765726, |
| "loss": 1.7388763427734375, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.323278315310187, |
| "grad_norm": 0.7945719957351685, |
| "learning_rate": 0.00042171175223286, |
| "loss": 1.74236572265625, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.465566306203755, |
| "grad_norm": 0.8893241882324219, |
| "learning_rate": 0.00041849900404806274, |
| "loss": 1.7380269775390624, |
| "step": 93000 |
| }, |
| { |
| "epoch": 26.607854297097326, |
| "grad_norm": 0.7732031345367432, |
| "learning_rate": 0.0004152862558632655, |
| "loss": 1.73862939453125, |
| "step": 93500 |
| }, |
| { |
| "epoch": 26.750142287990894, |
| "grad_norm": 0.8094469308853149, |
| "learning_rate": 0.00041207350767846816, |
| "loss": 1.7389964599609375, |
| "step": 94000 |
| }, |
| { |
| "epoch": 26.89243027888446, |
| "grad_norm": 0.7714164853096008, |
| "learning_rate": 0.0004088607594936709, |
| "loss": 1.7384852294921875, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.03471826977803, |
| "grad_norm": 0.8252856731414795, |
| "learning_rate": 0.00040564801130887364, |
| "loss": 1.7344244384765626, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.1770062606716, |
| "grad_norm": 0.7841668128967285, |
| "learning_rate": 0.0004024352631240764, |
| "loss": 1.7267880859375, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.319294251565168, |
| "grad_norm": 0.7918238639831543, |
| "learning_rate": 0.00039922251493927906, |
| "loss": 1.7263587646484375, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.461582242458736, |
| "grad_norm": 0.7744492888450623, |
| "learning_rate": 0.0003960097667544818, |
| "loss": 1.727615478515625, |
| "step": 96500 |
| }, |
| { |
| "epoch": 27.603870233352303, |
| "grad_norm": 0.833548367023468, |
| "learning_rate": 0.00039279701856968454, |
| "loss": 1.728632568359375, |
| "step": 97000 |
| }, |
| { |
| "epoch": 27.746158224245875, |
| "grad_norm": 0.7416006326675415, |
| "learning_rate": 0.0003895842703848872, |
| "loss": 1.7263828125, |
| "step": 97500 |
| }, |
| { |
| "epoch": 27.888446215139442, |
| "grad_norm": 0.8390225172042847, |
| "learning_rate": 0.00038637152220008996, |
| "loss": 1.7279501953125, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.03073420603301, |
| "grad_norm": 0.7987998723983765, |
| "learning_rate": 0.0003831587740152927, |
| "loss": 1.7208074951171874, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.173022196926578, |
| "grad_norm": 0.7214421629905701, |
| "learning_rate": 0.00037994602583049544, |
| "loss": 1.7175120849609375, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.31531018782015, |
| "grad_norm": 0.8112098574638367, |
| "learning_rate": 0.0003767332776456981, |
| "loss": 1.717369384765625, |
| "step": 99500 |
| }, |
| { |
| "epoch": 28.457598178713717, |
| "grad_norm": 0.825023353099823, |
| "learning_rate": 0.00037352052946090086, |
| "loss": 1.7164326171875, |
| "step": 100000 |
| }, |
| { |
| "epoch": 28.599886169607284, |
| "grad_norm": 0.7754949331283569, |
| "learning_rate": 0.0003703077812761036, |
| "loss": 1.7194827880859376, |
| "step": 100500 |
| }, |
| { |
| "epoch": 28.742174160500852, |
| "grad_norm": 0.815468430519104, |
| "learning_rate": 0.00036709503309130634, |
| "loss": 1.71721142578125, |
| "step": 101000 |
| }, |
| { |
| "epoch": 28.884462151394423, |
| "grad_norm": 0.7674705982208252, |
| "learning_rate": 0.000363882284906509, |
| "loss": 1.716550048828125, |
| "step": 101500 |
| }, |
| { |
| "epoch": 29.02675014228799, |
| "grad_norm": 0.8314065933227539, |
| "learning_rate": 0.00036066953672171176, |
| "loss": 1.70979833984375, |
| "step": 102000 |
| }, |
| { |
| "epoch": 29.16903813318156, |
| "grad_norm": 0.804320752620697, |
| "learning_rate": 0.0003574567885369145, |
| "loss": 1.7049903564453126, |
| "step": 102500 |
| }, |
| { |
| "epoch": 29.311326124075126, |
| "grad_norm": 0.7875852584838867, |
| "learning_rate": 0.0003542440403521172, |
| "loss": 1.70514208984375, |
| "step": 103000 |
| }, |
| { |
| "epoch": 29.453614114968698, |
| "grad_norm": 0.7865288257598877, |
| "learning_rate": 0.0003510312921673199, |
| "loss": 1.7045543212890626, |
| "step": 103500 |
| }, |
| { |
| "epoch": 29.595902105862265, |
| "grad_norm": 0.8601499199867249, |
| "learning_rate": 0.00034781854398252267, |
| "loss": 1.703396728515625, |
| "step": 104000 |
| }, |
| { |
| "epoch": 29.738190096755833, |
| "grad_norm": 0.8171347975730896, |
| "learning_rate": 0.0003446057957977254, |
| "loss": 1.7051766357421876, |
| "step": 104500 |
| }, |
| { |
| "epoch": 29.8804780876494, |
| "grad_norm": 0.8439942598342896, |
| "learning_rate": 0.0003413930476129281, |
| "loss": 1.702958740234375, |
| "step": 105000 |
| }, |
| { |
| "epoch": 30.022766078542972, |
| "grad_norm": 0.7871956825256348, |
| "learning_rate": 0.0003381802994281308, |
| "loss": 1.7031265869140626, |
| "step": 105500 |
| }, |
| { |
| "epoch": 30.16505406943654, |
| "grad_norm": 0.8173360228538513, |
| "learning_rate": 0.00033496755124333357, |
| "loss": 1.7017554931640626, |
| "step": 106000 |
| }, |
| { |
| "epoch": 30.307342060330107, |
| "grad_norm": 0.8125076293945312, |
| "learning_rate": 0.0003317548030585363, |
| "loss": 1.693796875, |
| "step": 106500 |
| }, |
| { |
| "epoch": 30.449630051223675, |
| "grad_norm": 0.8106098771095276, |
| "learning_rate": 0.000328542054873739, |
| "loss": 1.6901024169921874, |
| "step": 107000 |
| }, |
| { |
| "epoch": 30.591918042117246, |
| "grad_norm": 0.8278952240943909, |
| "learning_rate": 0.00032532930668894173, |
| "loss": 1.6938135986328124, |
| "step": 107500 |
| }, |
| { |
| "epoch": 30.734206033010814, |
| "grad_norm": 0.8017494082450867, |
| "learning_rate": 0.00032211655850414447, |
| "loss": 1.6923419189453126, |
| "step": 108000 |
| }, |
| { |
| "epoch": 30.87649402390438, |
| "grad_norm": 0.825312077999115, |
| "learning_rate": 0.00031890381031934715, |
| "loss": 1.6879945068359374, |
| "step": 108500 |
| }, |
| { |
| "epoch": 31.01878201479795, |
| "grad_norm": 0.8192269802093506, |
| "learning_rate": 0.0003156910621345499, |
| "loss": 1.69059716796875, |
| "step": 109000 |
| }, |
| { |
| "epoch": 31.16107000569152, |
| "grad_norm": 0.7704429030418396, |
| "learning_rate": 0.00031247831394975263, |
| "loss": 1.68362158203125, |
| "step": 109500 |
| }, |
| { |
| "epoch": 31.30335799658509, |
| "grad_norm": 0.781888484954834, |
| "learning_rate": 0.00030926556576495537, |
| "loss": 1.6819910888671874, |
| "step": 110000 |
| }, |
| { |
| "epoch": 31.445645987478656, |
| "grad_norm": 0.8602472543716431, |
| "learning_rate": 0.00030605281758015805, |
| "loss": 1.6821741943359374, |
| "step": 110500 |
| }, |
| { |
| "epoch": 31.587933978372224, |
| "grad_norm": 0.7637714743614197, |
| "learning_rate": 0.0003028400693953608, |
| "loss": 1.6784395751953125, |
| "step": 111000 |
| }, |
| { |
| "epoch": 31.730221969265795, |
| "grad_norm": 0.8043729662895203, |
| "learning_rate": 0.00029962732121056353, |
| "loss": 1.681421630859375, |
| "step": 111500 |
| }, |
| { |
| "epoch": 31.872509960159363, |
| "grad_norm": 0.8252000212669373, |
| "learning_rate": 0.00029641457302576627, |
| "loss": 1.681087646484375, |
| "step": 112000 |
| }, |
| { |
| "epoch": 32.014797951052934, |
| "grad_norm": 0.802941083908081, |
| "learning_rate": 0.00029320182484096895, |
| "loss": 1.6753712158203125, |
| "step": 112500 |
| }, |
| { |
| "epoch": 32.1570859419465, |
| "grad_norm": 0.814416766166687, |
| "learning_rate": 0.0002899890766561717, |
| "loss": 1.6733070068359375, |
| "step": 113000 |
| }, |
| { |
| "epoch": 32.29937393284007, |
| "grad_norm": 0.8030642867088318, |
| "learning_rate": 0.00028677632847137443, |
| "loss": 1.6685291748046875, |
| "step": 113500 |
| }, |
| { |
| "epoch": 32.44166192373363, |
| "grad_norm": 0.7646543383598328, |
| "learning_rate": 0.0002835635802865771, |
| "loss": 1.668842041015625, |
| "step": 114000 |
| }, |
| { |
| "epoch": 32.583949914627205, |
| "grad_norm": 0.770729660987854, |
| "learning_rate": 0.00028035083210177985, |
| "loss": 1.675781982421875, |
| "step": 114500 |
| }, |
| { |
| "epoch": 32.726237905520776, |
| "grad_norm": 0.7864305973052979, |
| "learning_rate": 0.0002771380839169826, |
| "loss": 1.6711048583984376, |
| "step": 115000 |
| }, |
| { |
| "epoch": 32.86852589641434, |
| "grad_norm": 0.7970394492149353, |
| "learning_rate": 0.00027392533573218533, |
| "loss": 1.6688397216796875, |
| "step": 115500 |
| }, |
| { |
| "epoch": 33.01081388730791, |
| "grad_norm": 0.7844989895820618, |
| "learning_rate": 0.000270712587547388, |
| "loss": 1.6707685546875, |
| "step": 116000 |
| }, |
| { |
| "epoch": 33.15310187820148, |
| "grad_norm": 0.7790973782539368, |
| "learning_rate": 0.00026749983936259075, |
| "loss": 1.663322509765625, |
| "step": 116500 |
| }, |
| { |
| "epoch": 33.29538986909505, |
| "grad_norm": 0.7947200536727905, |
| "learning_rate": 0.0002642870911777935, |
| "loss": 1.6612364501953125, |
| "step": 117000 |
| }, |
| { |
| "epoch": 33.43767785998862, |
| "grad_norm": 0.7712826728820801, |
| "learning_rate": 0.00026107434299299623, |
| "loss": 1.6583055419921875, |
| "step": 117500 |
| }, |
| { |
| "epoch": 33.57996585088218, |
| "grad_norm": 0.8012517690658569, |
| "learning_rate": 0.0002578615948081989, |
| "loss": 1.6572049560546875, |
| "step": 118000 |
| }, |
| { |
| "epoch": 33.72225384177575, |
| "grad_norm": 0.7550643682479858, |
| "learning_rate": 0.00025464884662340165, |
| "loss": 1.6559423828125, |
| "step": 118500 |
| }, |
| { |
| "epoch": 33.864541832669325, |
| "grad_norm": 0.857598602771759, |
| "learning_rate": 0.0002514360984386044, |
| "loss": 1.655523193359375, |
| "step": 119000 |
| }, |
| { |
| "epoch": 34.00682982356289, |
| "grad_norm": 0.7795873880386353, |
| "learning_rate": 0.0002482233502538071, |
| "loss": 1.6634495849609374, |
| "step": 119500 |
| }, |
| { |
| "epoch": 34.14911781445646, |
| "grad_norm": 0.8217109441757202, |
| "learning_rate": 0.0002450106020690098, |
| "loss": 1.648638427734375, |
| "step": 120000 |
| }, |
| { |
| "epoch": 34.29140580535003, |
| "grad_norm": 0.8032475709915161, |
| "learning_rate": 0.00024179785388421255, |
| "loss": 1.653551025390625, |
| "step": 120500 |
| }, |
| { |
| "epoch": 34.433693796243595, |
| "grad_norm": 0.7690367698669434, |
| "learning_rate": 0.00023858510569941526, |
| "loss": 1.65421923828125, |
| "step": 121000 |
| }, |
| { |
| "epoch": 34.57598178713717, |
| "grad_norm": 0.8075783848762512, |
| "learning_rate": 0.000235372357514618, |
| "loss": 1.64859423828125, |
| "step": 121500 |
| }, |
| { |
| "epoch": 34.71826977803073, |
| "grad_norm": 0.8210867047309875, |
| "learning_rate": 0.00023215960932982071, |
| "loss": 1.650385009765625, |
| "step": 122000 |
| }, |
| { |
| "epoch": 34.8605577689243, |
| "grad_norm": 0.763390839099884, |
| "learning_rate": 0.00022894686114502345, |
| "loss": 1.646854248046875, |
| "step": 122500 |
| }, |
| { |
| "epoch": 35.00284575981787, |
| "grad_norm": 0.7536216378211975, |
| "learning_rate": 0.0002257341129602262, |
| "loss": 1.6420960693359374, |
| "step": 123000 |
| }, |
| { |
| "epoch": 35.14513375071144, |
| "grad_norm": 0.8205796480178833, |
| "learning_rate": 0.00022252136477542893, |
| "loss": 1.6369521484375, |
| "step": 123500 |
| }, |
| { |
| "epoch": 35.28742174160501, |
| "grad_norm": 0.7954224348068237, |
| "learning_rate": 0.00021930861659063164, |
| "loss": 1.6364495849609375, |
| "step": 124000 |
| }, |
| { |
| "epoch": 35.42970973249858, |
| "grad_norm": 0.7882758975028992, |
| "learning_rate": 0.00021609586840583435, |
| "loss": 1.6430433349609375, |
| "step": 124500 |
| }, |
| { |
| "epoch": 35.571997723392144, |
| "grad_norm": 0.7636738419532776, |
| "learning_rate": 0.0002128831202210371, |
| "loss": 1.6426986083984374, |
| "step": 125000 |
| }, |
| { |
| "epoch": 35.714285714285715, |
| "grad_norm": 0.7501616477966309, |
| "learning_rate": 0.0002096703720362398, |
| "loss": 1.6390101318359376, |
| "step": 125500 |
| }, |
| { |
| "epoch": 35.85657370517928, |
| "grad_norm": 0.8382527232170105, |
| "learning_rate": 0.00020645762385144254, |
| "loss": 1.635440673828125, |
| "step": 126000 |
| }, |
| { |
| "epoch": 35.99886169607285, |
| "grad_norm": 0.7988425493240356, |
| "learning_rate": 0.00020324487566664525, |
| "loss": 1.638254150390625, |
| "step": 126500 |
| }, |
| { |
| "epoch": 36.14114968696642, |
| "grad_norm": 0.791145384311676, |
| "learning_rate": 0.000200032127481848, |
| "loss": 1.62477197265625, |
| "step": 127000 |
| }, |
| { |
| "epoch": 36.283437677859986, |
| "grad_norm": 0.7497517466545105, |
| "learning_rate": 0.0001968193792970507, |
| "loss": 1.632613037109375, |
| "step": 127500 |
| }, |
| { |
| "epoch": 36.42572566875356, |
| "grad_norm": 0.8118484020233154, |
| "learning_rate": 0.00019360663111225344, |
| "loss": 1.63175146484375, |
| "step": 128000 |
| }, |
| { |
| "epoch": 36.56801365964713, |
| "grad_norm": 0.8277371525764465, |
| "learning_rate": 0.00019039388292745615, |
| "loss": 1.6289234619140625, |
| "step": 128500 |
| }, |
| { |
| "epoch": 36.71030165054069, |
| "grad_norm": 0.7911479473114014, |
| "learning_rate": 0.0001871811347426589, |
| "loss": 1.6251759033203126, |
| "step": 129000 |
| }, |
| { |
| "epoch": 36.852589641434264, |
| "grad_norm": 0.7872730493545532, |
| "learning_rate": 0.0001839683865578616, |
| "loss": 1.6268780517578125, |
| "step": 129500 |
| }, |
| { |
| "epoch": 36.99487763232783, |
| "grad_norm": 0.7649409174919128, |
| "learning_rate": 0.00018075563837306432, |
| "loss": 1.6268775634765624, |
| "step": 130000 |
| }, |
| { |
| "epoch": 37.1371656232214, |
| "grad_norm": 0.8168938159942627, |
| "learning_rate": 0.00017754289018826706, |
| "loss": 1.61990966796875, |
| "step": 130500 |
| }, |
| { |
| "epoch": 37.27945361411497, |
| "grad_norm": 0.790477454662323, |
| "learning_rate": 0.00017433014200346977, |
| "loss": 1.618332275390625, |
| "step": 131000 |
| }, |
| { |
| "epoch": 37.421741605008535, |
| "grad_norm": 0.7928410768508911, |
| "learning_rate": 0.0001711173938186725, |
| "loss": 1.6161146240234374, |
| "step": 131500 |
| }, |
| { |
| "epoch": 37.564029595902106, |
| "grad_norm": 0.7818522453308105, |
| "learning_rate": 0.00016790464563387522, |
| "loss": 1.614980712890625, |
| "step": 132000 |
| }, |
| { |
| "epoch": 37.70631758679568, |
| "grad_norm": 0.7645919322967529, |
| "learning_rate": 0.00016469189744907796, |
| "loss": 1.6168826904296876, |
| "step": 132500 |
| }, |
| { |
| "epoch": 37.84860557768924, |
| "grad_norm": 0.7470947504043579, |
| "learning_rate": 0.00016147914926428067, |
| "loss": 1.6159892578125, |
| "step": 133000 |
| }, |
| { |
| "epoch": 37.99089356858281, |
| "grad_norm": 0.781106173992157, |
| "learning_rate": 0.0001582664010794834, |
| "loss": 1.6154237060546874, |
| "step": 133500 |
| }, |
| { |
| "epoch": 38.13318155947638, |
| "grad_norm": 0.8540311455726624, |
| "learning_rate": 0.00015505365289468612, |
| "loss": 1.6088677978515624, |
| "step": 134000 |
| }, |
| { |
| "epoch": 38.27546955036995, |
| "grad_norm": 0.8147649168968201, |
| "learning_rate": 0.00015184090470988886, |
| "loss": 1.608274658203125, |
| "step": 134500 |
| }, |
| { |
| "epoch": 38.41775754126352, |
| "grad_norm": 0.7976606488227844, |
| "learning_rate": 0.00014862815652509157, |
| "loss": 1.6079510498046874, |
| "step": 135000 |
| }, |
| { |
| "epoch": 38.56004553215708, |
| "grad_norm": 0.7808454036712646, |
| "learning_rate": 0.00014541540834029428, |
| "loss": 1.6069224853515625, |
| "step": 135500 |
| }, |
| { |
| "epoch": 38.702333523050655, |
| "grad_norm": 0.7801294922828674, |
| "learning_rate": 0.00014220266015549702, |
| "loss": 1.6083843994140625, |
| "step": 136000 |
| }, |
| { |
| "epoch": 38.844621513944226, |
| "grad_norm": 0.7674131989479065, |
| "learning_rate": 0.00013898991197069973, |
| "loss": 1.605281005859375, |
| "step": 136500 |
| }, |
| { |
| "epoch": 38.98690950483779, |
| "grad_norm": 0.7959563732147217, |
| "learning_rate": 0.00013577716378590247, |
| "loss": 1.6055640869140626, |
| "step": 137000 |
| }, |
| { |
| "epoch": 39.12919749573136, |
| "grad_norm": 0.8412840962409973, |
| "learning_rate": 0.00013256441560110518, |
| "loss": 1.6029959716796875, |
| "step": 137500 |
| }, |
| { |
| "epoch": 39.271485486624925, |
| "grad_norm": 0.8026737570762634, |
| "learning_rate": 0.00012935166741630792, |
| "loss": 1.600421875, |
| "step": 138000 |
| }, |
| { |
| "epoch": 39.4137734775185, |
| "grad_norm": 0.7729793190956116, |
| "learning_rate": 0.00012613891923151063, |
| "loss": 1.5968153076171876, |
| "step": 138500 |
| }, |
| { |
| "epoch": 39.55606146841207, |
| "grad_norm": 0.8124834299087524, |
| "learning_rate": 0.00012292617104671337, |
| "loss": 1.5938275146484375, |
| "step": 139000 |
| }, |
| { |
| "epoch": 39.69834945930563, |
| "grad_norm": 0.8108460307121277, |
| "learning_rate": 0.00011971342286191608, |
| "loss": 1.5965054931640625, |
| "step": 139500 |
| }, |
| { |
| "epoch": 39.8406374501992, |
| "grad_norm": 0.7830720543861389, |
| "learning_rate": 0.0001165006746771188, |
| "loss": 1.5987293701171874, |
| "step": 140000 |
| }, |
| { |
| "epoch": 39.982925441092775, |
| "grad_norm": 0.7493749856948853, |
| "learning_rate": 0.00011328792649232153, |
| "loss": 1.5955545654296874, |
| "step": 140500 |
| }, |
| { |
| "epoch": 40.12521343198634, |
| "grad_norm": 0.7606357932090759, |
| "learning_rate": 0.00011007517830752426, |
| "loss": 1.5879793701171876, |
| "step": 141000 |
| }, |
| { |
| "epoch": 40.26750142287991, |
| "grad_norm": 0.7698628306388855, |
| "learning_rate": 0.00010686243012272698, |
| "loss": 1.5900478515625, |
| "step": 141500 |
| }, |
| { |
| "epoch": 40.409789413773474, |
| "grad_norm": 0.8203583359718323, |
| "learning_rate": 0.0001036496819379297, |
| "loss": 1.5890828857421875, |
| "step": 142000 |
| }, |
| { |
| "epoch": 40.552077404667045, |
| "grad_norm": 0.7594188451766968, |
| "learning_rate": 0.00010043693375313243, |
| "loss": 1.5884957275390625, |
| "step": 142500 |
| }, |
| { |
| "epoch": 40.69436539556062, |
| "grad_norm": 0.8142854571342468, |
| "learning_rate": 9.722418556833516e-05, |
| "loss": 1.585271728515625, |
| "step": 143000 |
| }, |
| { |
| "epoch": 40.83665338645418, |
| "grad_norm": 0.867124080657959, |
| "learning_rate": 9.401143738353787e-05, |
| "loss": 1.584123046875, |
| "step": 143500 |
| }, |
| { |
| "epoch": 40.97894137734775, |
| "grad_norm": 0.7953840494155884, |
| "learning_rate": 9.07986891987406e-05, |
| "loss": 1.5837237548828125, |
| "step": 144000 |
| }, |
| { |
| "epoch": 41.12122936824132, |
| "grad_norm": 0.782312273979187, |
| "learning_rate": 8.758594101394333e-05, |
| "loss": 1.58353369140625, |
| "step": 144500 |
| }, |
| { |
| "epoch": 41.26351735913489, |
| "grad_norm": 0.7618634700775146, |
| "learning_rate": 8.437319282914606e-05, |
| "loss": 1.58224267578125, |
| "step": 145000 |
| }, |
| { |
| "epoch": 41.40580535002846, |
| "grad_norm": 0.8059329986572266, |
| "learning_rate": 8.116044464434878e-05, |
| "loss": 1.5730921630859376, |
| "step": 145500 |
| }, |
| { |
| "epoch": 41.54809334092202, |
| "grad_norm": 0.756024181842804, |
| "learning_rate": 7.79476964595515e-05, |
| "loss": 1.57981982421875, |
| "step": 146000 |
| }, |
| { |
| "epoch": 41.690381331815594, |
| "grad_norm": 0.8015180826187134, |
| "learning_rate": 7.473494827475423e-05, |
| "loss": 1.578093505859375, |
| "step": 146500 |
| }, |
| { |
| "epoch": 41.832669322709165, |
| "grad_norm": 0.7689457535743713, |
| "learning_rate": 7.152220008995696e-05, |
| "loss": 1.575974365234375, |
| "step": 147000 |
| }, |
| { |
| "epoch": 41.97495731360273, |
| "grad_norm": 0.7837737798690796, |
| "learning_rate": 6.830945190515968e-05, |
| "loss": 1.57426171875, |
| "step": 147500 |
| }, |
| { |
| "epoch": 42.1172453044963, |
| "grad_norm": 0.758391797542572, |
| "learning_rate": 6.509670372036241e-05, |
| "loss": 1.5727047119140625, |
| "step": 148000 |
| }, |
| { |
| "epoch": 42.25953329538987, |
| "grad_norm": 0.7854096293449402, |
| "learning_rate": 6.188395553556513e-05, |
| "loss": 1.56796923828125, |
| "step": 148500 |
| }, |
| { |
| "epoch": 42.401821286283436, |
| "grad_norm": 0.7863969206809998, |
| "learning_rate": 5.867120735076785e-05, |
| "loss": 1.571415771484375, |
| "step": 149000 |
| }, |
| { |
| "epoch": 42.54410927717701, |
| "grad_norm": 0.7552393674850464, |
| "learning_rate": 5.5458459165970575e-05, |
| "loss": 1.5717161865234375, |
| "step": 149500 |
| }, |
| { |
| "epoch": 42.68639726807057, |
| "grad_norm": 0.7907793521881104, |
| "learning_rate": 5.22457109811733e-05, |
| "loss": 1.565153564453125, |
| "step": 150000 |
| }, |
| { |
| "epoch": 42.82868525896414, |
| "grad_norm": 0.8196442127227783, |
| "learning_rate": 4.903296279637602e-05, |
| "loss": 1.5695428466796875, |
| "step": 150500 |
| }, |
| { |
| "epoch": 42.970973249857714, |
| "grad_norm": 0.7759655714035034, |
| "learning_rate": 4.5820214611578744e-05, |
| "loss": 1.5666607666015624, |
| "step": 151000 |
| }, |
| { |
| "epoch": 43.11326124075128, |
| "grad_norm": 0.7889202237129211, |
| "learning_rate": 4.260746642678147e-05, |
| "loss": 1.55801416015625, |
| "step": 151500 |
| }, |
| { |
| "epoch": 43.25554923164485, |
| "grad_norm": 0.7735246419906616, |
| "learning_rate": 3.9394718241984194e-05, |
| "loss": 1.563793212890625, |
| "step": 152000 |
| }, |
| { |
| "epoch": 43.39783722253842, |
| "grad_norm": 0.783096432685852, |
| "learning_rate": 3.618197005718692e-05, |
| "loss": 1.564925048828125, |
| "step": 152500 |
| }, |
| { |
| "epoch": 43.540125213431985, |
| "grad_norm": 0.8447062373161316, |
| "learning_rate": 3.296922187238964e-05, |
| "loss": 1.561072509765625, |
| "step": 153000 |
| }, |
| { |
| "epoch": 43.682413204325556, |
| "grad_norm": 0.8156583905220032, |
| "learning_rate": 2.9756473687592366e-05, |
| "loss": 1.5635389404296876, |
| "step": 153500 |
| }, |
| { |
| "epoch": 43.82470119521912, |
| "grad_norm": 0.8073771595954895, |
| "learning_rate": 2.654372550279509e-05, |
| "loss": 1.55376953125, |
| "step": 154000 |
| }, |
| { |
| "epoch": 43.96698918611269, |
| "grad_norm": 0.8184100389480591, |
| "learning_rate": 2.3330977317997816e-05, |
| "loss": 1.5572464599609375, |
| "step": 154500 |
| }, |
| { |
| "epoch": 44.10927717700626, |
| "grad_norm": 0.7976749539375305, |
| "learning_rate": 2.0118229133200538e-05, |
| "loss": 1.556713623046875, |
| "step": 155000 |
| }, |
| { |
| "epoch": 44.25156516789983, |
| "grad_norm": 0.7821652889251709, |
| "learning_rate": 1.6905480948403263e-05, |
| "loss": 1.5601466064453124, |
| "step": 155500 |
| }, |
| { |
| "epoch": 44.3938531587934, |
| "grad_norm": 0.7849637269973755, |
| "learning_rate": 1.3692732763605988e-05, |
| "loss": 1.555856201171875, |
| "step": 156000 |
| }, |
| { |
| "epoch": 44.53614114968697, |
| "grad_norm": 0.8034117221832275, |
| "learning_rate": 1.0479984578808712e-05, |
| "loss": 1.5571749267578125, |
| "step": 156500 |
| }, |
| { |
| "epoch": 44.67842914058053, |
| "grad_norm": 0.7568624019622803, |
| "learning_rate": 7.267236394011437e-06, |
| "loss": 1.553765625, |
| "step": 157000 |
| }, |
| { |
| "epoch": 44.820717131474105, |
| "grad_norm": 0.7983436584472656, |
| "learning_rate": 4.054488209214162e-06, |
| "loss": 1.55248681640625, |
| "step": 157500 |
| }, |
| { |
| "epoch": 44.96300512236767, |
| "grad_norm": 0.8003237247467041, |
| "learning_rate": 8.417400244168862e-07, |
| "loss": 1.554723388671875, |
| "step": 158000 |
| }, |
| { |
| "epoch": 45.0, |
| "step": 158130, |
| "total_flos": 3.365420849747251e+17, |
| "train_loss": 1.929684388318656, |
| "train_runtime": 25758.7104, |
| "train_samples_per_second": 392.825, |
| "train_steps_per_second": 6.139 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 158130, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 45, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.365420849747251e+17, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|