Safetensors
English
bert
astrobert-small / trainer_state.json
davidmezzetti's picture
Add model
a84fcbf
Raw
History Blame Contribute Delete
59.6 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 45.0,
"eval_steps": 500,
"global_step": 158130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14228799089356858,
"grad_norm": 0.6334199905395508,
"learning_rate": 0.0001996,
"loss": 7.48322509765625,
"step": 500
},
{
"epoch": 0.28457598178713717,
"grad_norm": 1.0710968971252441,
"learning_rate": 0.0003996,
"loss": 6.2563916015625,
"step": 1000
},
{
"epoch": 0.42686397268070575,
"grad_norm": 0.6296343207359314,
"learning_rate": 0.0005996,
"loss": 6.11807275390625,
"step": 1500
},
{
"epoch": 0.5691519635742743,
"grad_norm": 0.9468093514442444,
"learning_rate": 0.0007996,
"loss": 6.00693505859375,
"step": 2000
},
{
"epoch": 0.7114399544678429,
"grad_norm": 1.3960775136947632,
"learning_rate": 0.0009996,
"loss": 5.5765498046875,
"step": 2500
},
{
"epoch": 0.8537279453614115,
"grad_norm": 1.1396394968032837,
"learning_rate": 0.0009967936773115724,
"loss": 4.27981396484375,
"step": 3000
},
{
"epoch": 0.9960159362549801,
"grad_norm": 1.1924173831939697,
"learning_rate": 0.000993580929126775,
"loss": 3.510773193359375,
"step": 3500
},
{
"epoch": 1.1383039271485487,
"grad_norm": 1.1864285469055176,
"learning_rate": 0.0009903681809419777,
"loss": 3.184400390625,
"step": 4000
},
{
"epoch": 1.2805919180421172,
"grad_norm": 0.9659832119941711,
"learning_rate": 0.0009871554327571805,
"loss": 2.9972861328125,
"step": 4500
},
{
"epoch": 1.4228799089356858,
"grad_norm": 1.0554136037826538,
"learning_rate": 0.0009839426845723832,
"loss": 2.87672265625,
"step": 5000
},
{
"epoch": 1.5651678998292544,
"grad_norm": 0.989261269569397,
"learning_rate": 0.000980729936387586,
"loss": 2.784982177734375,
"step": 5500
},
{
"epoch": 1.707455890722823,
"grad_norm": 1.0604394674301147,
"learning_rate": 0.0009775171882027887,
"loss": 2.708709716796875,
"step": 6000
},
{
"epoch": 1.8497438816163916,
"grad_norm": 0.9650368094444275,
"learning_rate": 0.0009743044400179915,
"loss": 2.65090625,
"step": 6500
},
{
"epoch": 1.9920318725099602,
"grad_norm": 0.9355012774467468,
"learning_rate": 0.0009710916918331941,
"loss": 2.607532470703125,
"step": 7000
},
{
"epoch": 2.1343198634035287,
"grad_norm": 0.962996780872345,
"learning_rate": 0.0009678789436483969,
"loss": 2.54883447265625,
"step": 7500
},
{
"epoch": 2.2766078542970973,
"grad_norm": 1.0536885261535645,
"learning_rate": 0.0009646661954635996,
"loss": 2.514782958984375,
"step": 8000
},
{
"epoch": 2.418895845190666,
"grad_norm": 1.0663453340530396,
"learning_rate": 0.0009614534472788023,
"loss": 2.487317138671875,
"step": 8500
},
{
"epoch": 2.5611838360842345,
"grad_norm": 0.9626434445381165,
"learning_rate": 0.000958240699094005,
"loss": 2.468934814453125,
"step": 9000
},
{
"epoch": 2.703471826977803,
"grad_norm": 0.9301921129226685,
"learning_rate": 0.0009550279509092078,
"loss": 2.447890380859375,
"step": 9500
},
{
"epoch": 2.8457598178713717,
"grad_norm": 1.0070905685424805,
"learning_rate": 0.0009518152027244105,
"loss": 2.418142333984375,
"step": 10000
},
{
"epoch": 2.9880478087649402,
"grad_norm": 0.9236523509025574,
"learning_rate": 0.0009486024545396133,
"loss": 2.40807373046875,
"step": 10500
},
{
"epoch": 3.130335799658509,
"grad_norm": 0.9421653747558594,
"learning_rate": 0.0009453897063548159,
"loss": 2.368919677734375,
"step": 11000
},
{
"epoch": 3.2726237905520774,
"grad_norm": 0.9248610138893127,
"learning_rate": 0.0009421769581700187,
"loss": 2.36261572265625,
"step": 11500
},
{
"epoch": 3.414911781445646,
"grad_norm": 0.9469349384307861,
"learning_rate": 0.0009389642099852214,
"loss": 2.3440390625,
"step": 12000
},
{
"epoch": 3.5571997723392146,
"grad_norm": 1.0009181499481201,
"learning_rate": 0.0009357514618004241,
"loss": 2.32732666015625,
"step": 12500
},
{
"epoch": 3.699487763232783,
"grad_norm": 0.9607020020484924,
"learning_rate": 0.0009325387136156268,
"loss": 2.313716552734375,
"step": 13000
},
{
"epoch": 3.8417757541263517,
"grad_norm": 0.9231477975845337,
"learning_rate": 0.0009293259654308296,
"loss": 2.30414453125,
"step": 13500
},
{
"epoch": 3.9840637450199203,
"grad_norm": 0.9701341986656189,
"learning_rate": 0.0009261132172460323,
"loss": 2.2853681640625,
"step": 14000
},
{
"epoch": 4.126351735913489,
"grad_norm": 0.9349779486656189,
"learning_rate": 0.000922900469061235,
"loss": 2.26698486328125,
"step": 14500
},
{
"epoch": 4.2686397268070575,
"grad_norm": 0.9157093167304993,
"learning_rate": 0.0009196877208764377,
"loss": 2.256426513671875,
"step": 15000
},
{
"epoch": 4.410927717700626,
"grad_norm": 0.9236478209495544,
"learning_rate": 0.0009164749726916405,
"loss": 2.25071337890625,
"step": 15500
},
{
"epoch": 4.553215708594195,
"grad_norm": 0.9309902191162109,
"learning_rate": 0.0009132622245068431,
"loss": 2.2407861328125,
"step": 16000
},
{
"epoch": 4.695503699487763,
"grad_norm": 0.9968065619468689,
"learning_rate": 0.000910049476322046,
"loss": 2.232171142578125,
"step": 16500
},
{
"epoch": 4.837791690381332,
"grad_norm": 0.893164336681366,
"learning_rate": 0.0009068367281372486,
"loss": 2.223586669921875,
"step": 17000
},
{
"epoch": 4.9800796812749,
"grad_norm": 0.9570403695106506,
"learning_rate": 0.0009036239799524514,
"loss": 2.2169453125,
"step": 17500
},
{
"epoch": 5.122367672168469,
"grad_norm": 0.9152570962905884,
"learning_rate": 0.0009004112317676541,
"loss": 2.1965634765625,
"step": 18000
},
{
"epoch": 5.264655663062038,
"grad_norm": 0.9538166522979736,
"learning_rate": 0.0008971984835828568,
"loss": 2.18993994140625,
"step": 18500
},
{
"epoch": 5.406943653955606,
"grad_norm": 0.8849217891693115,
"learning_rate": 0.0008939857353980595,
"loss": 2.179218994140625,
"step": 19000
},
{
"epoch": 5.549231644849175,
"grad_norm": 0.9768293499946594,
"learning_rate": 0.0008907729872132623,
"loss": 2.174207763671875,
"step": 19500
},
{
"epoch": 5.691519635742743,
"grad_norm": 0.9277469515800476,
"learning_rate": 0.0008875602390284649,
"loss": 2.1700234375,
"step": 20000
},
{
"epoch": 5.833807626636312,
"grad_norm": 0.9375103116035461,
"learning_rate": 0.0008843474908436677,
"loss": 2.166572998046875,
"step": 20500
},
{
"epoch": 5.9760956175298805,
"grad_norm": 0.9594961404800415,
"learning_rate": 0.0008811347426588704,
"loss": 2.15810888671875,
"step": 21000
},
{
"epoch": 6.118383608423449,
"grad_norm": 0.8761999607086182,
"learning_rate": 0.0008779219944740731,
"loss": 2.13630029296875,
"step": 21500
},
{
"epoch": 6.260671599317018,
"grad_norm": 0.8995809555053711,
"learning_rate": 0.0008747092462892759,
"loss": 2.13485693359375,
"step": 22000
},
{
"epoch": 6.402959590210586,
"grad_norm": 0.9557548761367798,
"learning_rate": 0.0008714964981044786,
"loss": 2.12809130859375,
"step": 22500
},
{
"epoch": 6.545247581104155,
"grad_norm": 0.8720033168792725,
"learning_rate": 0.0008682837499196813,
"loss": 2.12343359375,
"step": 23000
},
{
"epoch": 6.687535571997723,
"grad_norm": 0.9293733239173889,
"learning_rate": 0.0008650710017348841,
"loss": 2.121170166015625,
"step": 23500
},
{
"epoch": 6.829823562891292,
"grad_norm": 0.9301312565803528,
"learning_rate": 0.0008618582535500867,
"loss": 2.115037841796875,
"step": 24000
},
{
"epoch": 6.972111553784861,
"grad_norm": 0.9322926998138428,
"learning_rate": 0.0008586455053652895,
"loss": 2.11170068359375,
"step": 24500
},
{
"epoch": 7.114399544678429,
"grad_norm": 0.9266783595085144,
"learning_rate": 0.0008554327571804922,
"loss": 2.0983544921875,
"step": 25000
},
{
"epoch": 7.256687535571998,
"grad_norm": 0.9406460523605347,
"learning_rate": 0.0008522200089956949,
"loss": 2.08985888671875,
"step": 25500
},
{
"epoch": 7.398975526465566,
"grad_norm": 0.923007607460022,
"learning_rate": 0.0008490072608108977,
"loss": 2.093153564453125,
"step": 26000
},
{
"epoch": 7.541263517359135,
"grad_norm": 0.9110872149467468,
"learning_rate": 0.0008457945126261004,
"loss": 2.080158447265625,
"step": 26500
},
{
"epoch": 7.6835515082527035,
"grad_norm": 0.9028299450874329,
"learning_rate": 0.000842581764441303,
"loss": 2.0810068359375,
"step": 27000
},
{
"epoch": 7.825839499146272,
"grad_norm": 0.9071224927902222,
"learning_rate": 0.0008393690162565059,
"loss": 2.08135595703125,
"step": 27500
},
{
"epoch": 7.968127490039841,
"grad_norm": 0.8702104687690735,
"learning_rate": 0.0008361562680717085,
"loss": 2.073836181640625,
"step": 28000
},
{
"epoch": 8.110415480933408,
"grad_norm": 0.9875132441520691,
"learning_rate": 0.0008329435198869114,
"loss": 2.06207470703125,
"step": 28500
},
{
"epoch": 8.252703471826978,
"grad_norm": 0.8733546733856201,
"learning_rate": 0.000829730771702114,
"loss": 2.0590986328125,
"step": 29000
},
{
"epoch": 8.394991462720546,
"grad_norm": 0.8623999357223511,
"learning_rate": 0.0008265180235173167,
"loss": 2.05387744140625,
"step": 29500
},
{
"epoch": 8.537279453614115,
"grad_norm": 0.9093107581138611,
"learning_rate": 0.0008233052753325195,
"loss": 2.049600830078125,
"step": 30000
},
{
"epoch": 8.679567444507683,
"grad_norm": 0.8935321569442749,
"learning_rate": 0.0008200925271477222,
"loss": 2.04915966796875,
"step": 30500
},
{
"epoch": 8.821855435401252,
"grad_norm": 0.9517919421195984,
"learning_rate": 0.0008168797789629248,
"loss": 2.044854248046875,
"step": 31000
},
{
"epoch": 8.964143426294822,
"grad_norm": 0.924595057964325,
"learning_rate": 0.0008136670307781277,
"loss": 2.0417685546875,
"step": 31500
},
{
"epoch": 9.10643141718839,
"grad_norm": 0.8856834173202515,
"learning_rate": 0.0008104542825933303,
"loss": 2.0327542724609375,
"step": 32000
},
{
"epoch": 9.248719408081957,
"grad_norm": 0.851649820804596,
"learning_rate": 0.000807241534408533,
"loss": 2.026624267578125,
"step": 32500
},
{
"epoch": 9.391007398975526,
"grad_norm": 0.8763725161552429,
"learning_rate": 0.0008040287862237358,
"loss": 2.023781982421875,
"step": 33000
},
{
"epoch": 9.533295389869096,
"grad_norm": 0.9054471850395203,
"learning_rate": 0.0008008160380389385,
"loss": 2.0217911376953124,
"step": 33500
},
{
"epoch": 9.675583380762664,
"grad_norm": 0.9936219453811646,
"learning_rate": 0.0007976032898541413,
"loss": 2.0223516845703124,
"step": 34000
},
{
"epoch": 9.817871371656231,
"grad_norm": 0.8445395231246948,
"learning_rate": 0.000794390541669344,
"loss": 2.0150150146484376,
"step": 34500
},
{
"epoch": 9.9601593625498,
"grad_norm": 0.8997848629951477,
"learning_rate": 0.0007911777934845466,
"loss": 2.0144486083984376,
"step": 35000
},
{
"epoch": 10.102447353443369,
"grad_norm": 0.9022369384765625,
"learning_rate": 0.0007879650452997495,
"loss": 2.0057178955078125,
"step": 35500
},
{
"epoch": 10.244735344336938,
"grad_norm": 0.8679398894309998,
"learning_rate": 0.0007847522971149521,
"loss": 1.996031005859375,
"step": 36000
},
{
"epoch": 10.387023335230506,
"grad_norm": 0.8584362864494324,
"learning_rate": 0.0007815395489301548,
"loss": 1.9979686279296875,
"step": 36500
},
{
"epoch": 10.529311326124075,
"grad_norm": 0.919893741607666,
"learning_rate": 0.0007783268007453576,
"loss": 1.9953302001953126,
"step": 37000
},
{
"epoch": 10.671599317017645,
"grad_norm": 0.8829244375228882,
"learning_rate": 0.0007751140525605603,
"loss": 1.9904837646484375,
"step": 37500
},
{
"epoch": 10.813887307911212,
"grad_norm": 0.8700593113899231,
"learning_rate": 0.000771901304375763,
"loss": 1.9878653564453126,
"step": 38000
},
{
"epoch": 10.95617529880478,
"grad_norm": 0.8948189616203308,
"learning_rate": 0.0007686885561909658,
"loss": 1.9861346435546876,
"step": 38500
},
{
"epoch": 11.09846328969835,
"grad_norm": 0.8743970990180969,
"learning_rate": 0.0007654758080061684,
"loss": 1.9781466064453126,
"step": 39000
},
{
"epoch": 11.240751280591917,
"grad_norm": 0.8927684426307678,
"learning_rate": 0.0007622630598213713,
"loss": 1.9800474853515626,
"step": 39500
},
{
"epoch": 11.383039271485487,
"grad_norm": 0.8602464199066162,
"learning_rate": 0.0007590503116365739,
"loss": 1.969161865234375,
"step": 40000
},
{
"epoch": 11.525327262379054,
"grad_norm": 0.9330834746360779,
"learning_rate": 0.0007558375634517766,
"loss": 1.9744647216796876,
"step": 40500
},
{
"epoch": 11.667615253272624,
"grad_norm": 0.8487387895584106,
"learning_rate": 0.0007526248152669794,
"loss": 1.9631361083984376,
"step": 41000
},
{
"epoch": 11.809903244166193,
"grad_norm": 0.8121556639671326,
"learning_rate": 0.0007494120670821821,
"loss": 1.9670977783203125,
"step": 41500
},
{
"epoch": 11.952191235059761,
"grad_norm": 0.8926526308059692,
"learning_rate": 0.0007461993188973848,
"loss": 1.962933349609375,
"step": 42000
},
{
"epoch": 12.094479225953329,
"grad_norm": 0.8671021461486816,
"learning_rate": 0.0007429865707125876,
"loss": 1.9563138427734375,
"step": 42500
},
{
"epoch": 12.236767216846898,
"grad_norm": 0.8316618800163269,
"learning_rate": 0.0007397738225277902,
"loss": 1.9496668701171875,
"step": 43000
},
{
"epoch": 12.379055207740466,
"grad_norm": 0.8220306038856506,
"learning_rate": 0.000736561074342993,
"loss": 1.9505400390625,
"step": 43500
},
{
"epoch": 12.521343198634035,
"grad_norm": 0.882720410823822,
"learning_rate": 0.0007333483261581957,
"loss": 1.9462012939453126,
"step": 44000
},
{
"epoch": 12.663631189527603,
"grad_norm": 0.8841201663017273,
"learning_rate": 0.0007301355779733984,
"loss": 1.94572216796875,
"step": 44500
},
{
"epoch": 12.805919180421172,
"grad_norm": 0.8917742967605591,
"learning_rate": 0.0007269228297886012,
"loss": 1.9462489013671875,
"step": 45000
},
{
"epoch": 12.948207171314742,
"grad_norm": 0.8612926006317139,
"learning_rate": 0.0007237100816038039,
"loss": 1.9503948974609375,
"step": 45500
},
{
"epoch": 13.09049516220831,
"grad_norm": 0.9925726652145386,
"learning_rate": 0.0007204973334190066,
"loss": 1.9372962646484375,
"step": 46000
},
{
"epoch": 13.232783153101877,
"grad_norm": 0.8212889432907104,
"learning_rate": 0.0007172845852342094,
"loss": 1.93100146484375,
"step": 46500
},
{
"epoch": 13.375071143995447,
"grad_norm": 0.8393476605415344,
"learning_rate": 0.000714071837049412,
"loss": 1.9355791015625,
"step": 47000
},
{
"epoch": 13.517359134889016,
"grad_norm": 0.876530647277832,
"learning_rate": 0.0007108590888646148,
"loss": 1.929031982421875,
"step": 47500
},
{
"epoch": 13.659647125782584,
"grad_norm": 0.9148900508880615,
"learning_rate": 0.0007076463406798175,
"loss": 1.9321796875,
"step": 48000
},
{
"epoch": 13.801935116676152,
"grad_norm": 0.8540393114089966,
"learning_rate": 0.0007044335924950202,
"loss": 1.9232811279296875,
"step": 48500
},
{
"epoch": 13.944223107569721,
"grad_norm": 0.874427080154419,
"learning_rate": 0.0007012208443102229,
"loss": 1.920580078125,
"step": 49000
},
{
"epoch": 14.086511098463289,
"grad_norm": 0.9040400385856628,
"learning_rate": 0.0006980080961254257,
"loss": 1.912923095703125,
"step": 49500
},
{
"epoch": 14.228799089356858,
"grad_norm": 0.9279779195785522,
"learning_rate": 0.0006947953479406284,
"loss": 1.9119281005859374,
"step": 50000
},
{
"epoch": 14.371087080250426,
"grad_norm": 0.8857927322387695,
"learning_rate": 0.0006915825997558312,
"loss": 1.907870361328125,
"step": 50500
},
{
"epoch": 14.513375071143995,
"grad_norm": 0.8245786428451538,
"learning_rate": 0.0006883698515710338,
"loss": 1.9115518798828126,
"step": 51000
},
{
"epoch": 14.655663062037565,
"grad_norm": 0.8284105658531189,
"learning_rate": 0.0006851571033862366,
"loss": 1.91026611328125,
"step": 51500
},
{
"epoch": 14.797951052931133,
"grad_norm": 0.8700644969940186,
"learning_rate": 0.0006819443552014393,
"loss": 1.9089705810546875,
"step": 52000
},
{
"epoch": 14.9402390438247,
"grad_norm": 0.8545106649398804,
"learning_rate": 0.000678731607016642,
"loss": 1.9038966064453124,
"step": 52500
},
{
"epoch": 15.08252703471827,
"grad_norm": 0.8972774744033813,
"learning_rate": 0.0006755188588318448,
"loss": 1.9033292236328125,
"step": 53000
},
{
"epoch": 15.224815025611838,
"grad_norm": 0.8351185321807861,
"learning_rate": 0.0006723061106470475,
"loss": 1.8937845458984375,
"step": 53500
},
{
"epoch": 15.367103016505407,
"grad_norm": 0.8219364285469055,
"learning_rate": 0.0006690933624622503,
"loss": 1.8965240478515626,
"step": 54000
},
{
"epoch": 15.509391007398975,
"grad_norm": 0.8361454606056213,
"learning_rate": 0.0006658806142774529,
"loss": 1.89189599609375,
"step": 54500
},
{
"epoch": 15.651678998292544,
"grad_norm": 0.8974409699440002,
"learning_rate": 0.0006626678660926557,
"loss": 1.89478369140625,
"step": 55000
},
{
"epoch": 15.793966989186112,
"grad_norm": 0.880893886089325,
"learning_rate": 0.0006594551179078584,
"loss": 1.8854979248046875,
"step": 55500
},
{
"epoch": 15.936254980079681,
"grad_norm": 0.8354572057723999,
"learning_rate": 0.0006562423697230612,
"loss": 1.884849853515625,
"step": 56000
},
{
"epoch": 16.07854297097325,
"grad_norm": 0.8348339200019836,
"learning_rate": 0.0006530296215382638,
"loss": 1.8829962158203124,
"step": 56500
},
{
"epoch": 16.220830961866817,
"grad_norm": 0.828911304473877,
"learning_rate": 0.0006498168733534666,
"loss": 1.8815081787109376,
"step": 57000
},
{
"epoch": 16.363118952760388,
"grad_norm": 0.8096295595169067,
"learning_rate": 0.0006466041251686693,
"loss": 1.87695361328125,
"step": 57500
},
{
"epoch": 16.505406943653956,
"grad_norm": 0.8548514246940613,
"learning_rate": 0.0006433913769838721,
"loss": 1.8754146728515626,
"step": 58000
},
{
"epoch": 16.647694934547523,
"grad_norm": 0.8417186737060547,
"learning_rate": 0.0006401786287990747,
"loss": 1.8718682861328124,
"step": 58500
},
{
"epoch": 16.78998292544109,
"grad_norm": 0.8816096186637878,
"learning_rate": 0.0006369658806142775,
"loss": 1.876389404296875,
"step": 59000
},
{
"epoch": 16.932270916334662,
"grad_norm": 0.8592162728309631,
"learning_rate": 0.0006337531324294802,
"loss": 1.8765806884765626,
"step": 59500
},
{
"epoch": 17.07455890722823,
"grad_norm": 0.8253895044326782,
"learning_rate": 0.0006305403842446829,
"loss": 1.862882568359375,
"step": 60000
},
{
"epoch": 17.216846898121798,
"grad_norm": 0.848976731300354,
"learning_rate": 0.0006273276360598856,
"loss": 1.860851806640625,
"step": 60500
},
{
"epoch": 17.359134889015365,
"grad_norm": 0.8160614967346191,
"learning_rate": 0.0006241148878750884,
"loss": 1.8602349853515625,
"step": 61000
},
{
"epoch": 17.501422879908937,
"grad_norm": 0.8240634799003601,
"learning_rate": 0.0006209021396902911,
"loss": 1.855229736328125,
"step": 61500
},
{
"epoch": 17.643710870802504,
"grad_norm": 0.82338947057724,
"learning_rate": 0.0006176893915054939,
"loss": 1.8570489501953125,
"step": 62000
},
{
"epoch": 17.785998861696072,
"grad_norm": 0.7808911204338074,
"learning_rate": 0.0006144766433206965,
"loss": 1.8573056640625,
"step": 62500
},
{
"epoch": 17.92828685258964,
"grad_norm": 0.8358622789382935,
"learning_rate": 0.0006112638951358993,
"loss": 1.860720703125,
"step": 63000
},
{
"epoch": 18.07057484348321,
"grad_norm": 0.8733948469161987,
"learning_rate": 0.000608051146951102,
"loss": 1.8501636962890624,
"step": 63500
},
{
"epoch": 18.21286283437678,
"grad_norm": 0.8293560743331909,
"learning_rate": 0.0006048383987663047,
"loss": 1.8436048583984375,
"step": 64000
},
{
"epoch": 18.355150825270346,
"grad_norm": 0.8083025217056274,
"learning_rate": 0.0006016256505815074,
"loss": 1.845773681640625,
"step": 64500
},
{
"epoch": 18.497438816163914,
"grad_norm": 0.8062528967857361,
"learning_rate": 0.0005984129023967102,
"loss": 1.849447998046875,
"step": 65000
},
{
"epoch": 18.639726807057485,
"grad_norm": 0.797907829284668,
"learning_rate": 0.0005952001542119128,
"loss": 1.8439942626953125,
"step": 65500
},
{
"epoch": 18.782014797951053,
"grad_norm": 0.8070884346961975,
"learning_rate": 0.0005919874060271157,
"loss": 1.8405416259765626,
"step": 66000
},
{
"epoch": 18.92430278884462,
"grad_norm": 0.8484400510787964,
"learning_rate": 0.0005887746578423183,
"loss": 1.8404969482421876,
"step": 66500
},
{
"epoch": 19.06659077973819,
"grad_norm": 0.8148825168609619,
"learning_rate": 0.0005855619096575211,
"loss": 1.8372745361328124,
"step": 67000
},
{
"epoch": 19.20887877063176,
"grad_norm": 0.7816134691238403,
"learning_rate": 0.0005823491614727238,
"loss": 1.8346165771484375,
"step": 67500
},
{
"epoch": 19.351166761525327,
"grad_norm": 0.8447558283805847,
"learning_rate": 0.0005791364132879265,
"loss": 1.8296884765625,
"step": 68000
},
{
"epoch": 19.493454752418895,
"grad_norm": 0.8260893225669861,
"learning_rate": 0.0005759236651031293,
"loss": 1.830061279296875,
"step": 68500
},
{
"epoch": 19.635742743312463,
"grad_norm": 0.7893286347389221,
"learning_rate": 0.000572710916918332,
"loss": 1.829275634765625,
"step": 69000
},
{
"epoch": 19.778030734206034,
"grad_norm": 0.8122330904006958,
"learning_rate": 0.0005694981687335346,
"loss": 1.829096923828125,
"step": 69500
},
{
"epoch": 19.9203187250996,
"grad_norm": 0.825334370136261,
"learning_rate": 0.0005662854205487375,
"loss": 1.82976708984375,
"step": 70000
},
{
"epoch": 20.06260671599317,
"grad_norm": 0.8224254846572876,
"learning_rate": 0.0005630726723639401,
"loss": 1.8201314697265625,
"step": 70500
},
{
"epoch": 20.204894706886737,
"grad_norm": 0.8266887068748474,
"learning_rate": 0.0005598599241791428,
"loss": 1.821681884765625,
"step": 71000
},
{
"epoch": 20.34718269778031,
"grad_norm": 0.8528222441673279,
"learning_rate": 0.0005566471759943456,
"loss": 1.813265869140625,
"step": 71500
},
{
"epoch": 20.489470688673876,
"grad_norm": 0.8061295747756958,
"learning_rate": 0.0005534344278095483,
"loss": 1.81572412109375,
"step": 72000
},
{
"epoch": 20.631758679567444,
"grad_norm": 0.8042652010917664,
"learning_rate": 0.000550221679624751,
"loss": 1.8168936767578125,
"step": 72500
},
{
"epoch": 20.77404667046101,
"grad_norm": 0.7869358658790588,
"learning_rate": 0.0005470089314399538,
"loss": 1.814272705078125,
"step": 73000
},
{
"epoch": 20.916334661354583,
"grad_norm": 0.8155378699302673,
"learning_rate": 0.0005437961832551564,
"loss": 1.81436279296875,
"step": 73500
},
{
"epoch": 21.05862265224815,
"grad_norm": 0.7901885509490967,
"learning_rate": 0.0005405834350703593,
"loss": 1.811807861328125,
"step": 74000
},
{
"epoch": 21.200910643141718,
"grad_norm": 0.7788444757461548,
"learning_rate": 0.0005373706868855619,
"loss": 1.8064168701171874,
"step": 74500
},
{
"epoch": 21.343198634035286,
"grad_norm": 0.8150326609611511,
"learning_rate": 0.0005341579387007646,
"loss": 1.7998411865234376,
"step": 75000
},
{
"epoch": 21.485486624928857,
"grad_norm": 0.790658175945282,
"learning_rate": 0.0005309451905159674,
"loss": 1.8009261474609375,
"step": 75500
},
{
"epoch": 21.627774615822425,
"grad_norm": 0.8362455368041992,
"learning_rate": 0.0005277324423311701,
"loss": 1.7981884765625,
"step": 76000
},
{
"epoch": 21.770062606715992,
"grad_norm": 0.8072263598442078,
"learning_rate": 0.0005245196941463727,
"loss": 1.802354248046875,
"step": 76500
},
{
"epoch": 21.91235059760956,
"grad_norm": 0.8608630895614624,
"learning_rate": 0.0005213069459615756,
"loss": 1.7926978759765626,
"step": 77000
},
{
"epoch": 22.05463858850313,
"grad_norm": 0.8350149989128113,
"learning_rate": 0.0005180941977767782,
"loss": 1.7972198486328126,
"step": 77500
},
{
"epoch": 22.1969265793967,
"grad_norm": 0.8605798482894897,
"learning_rate": 0.0005148814495919811,
"loss": 1.7945047607421876,
"step": 78000
},
{
"epoch": 22.339214570290267,
"grad_norm": 0.8054996728897095,
"learning_rate": 0.0005116687014071837,
"loss": 1.7889041748046874,
"step": 78500
},
{
"epoch": 22.481502561183834,
"grad_norm": 0.8018432855606079,
"learning_rate": 0.0005084559532223864,
"loss": 1.7955711669921874,
"step": 79000
},
{
"epoch": 22.623790552077406,
"grad_norm": 0.8167839050292969,
"learning_rate": 0.0005052432050375892,
"loss": 1.789824462890625,
"step": 79500
},
{
"epoch": 22.766078542970973,
"grad_norm": 0.8051914572715759,
"learning_rate": 0.0005020304568527919,
"loss": 1.786076904296875,
"step": 80000
},
{
"epoch": 22.90836653386454,
"grad_norm": 0.8690944314002991,
"learning_rate": 0.0004988177086679945,
"loss": 1.787760009765625,
"step": 80500
},
{
"epoch": 23.05065452475811,
"grad_norm": 0.7857160568237305,
"learning_rate": 0.0004956049604831974,
"loss": 1.78182080078125,
"step": 81000
},
{
"epoch": 23.19294251565168,
"grad_norm": 0.792676568031311,
"learning_rate": 0.0004923922122984001,
"loss": 1.7775865478515624,
"step": 81500
},
{
"epoch": 23.335230506545248,
"grad_norm": 0.8044800162315369,
"learning_rate": 0.0004891794641136028,
"loss": 1.776404052734375,
"step": 82000
},
{
"epoch": 23.477518497438815,
"grad_norm": 0.8092531561851501,
"learning_rate": 0.00048596671592880555,
"loss": 1.7750899658203125,
"step": 82500
},
{
"epoch": 23.619806488332383,
"grad_norm": 0.8156195878982544,
"learning_rate": 0.00048275396774400824,
"loss": 1.7721761474609374,
"step": 83000
},
{
"epoch": 23.762094479225954,
"grad_norm": 0.8054773211479187,
"learning_rate": 0.000479541219559211,
"loss": 1.7737218017578125,
"step": 83500
},
{
"epoch": 23.904382470119522,
"grad_norm": 0.8544410467147827,
"learning_rate": 0.0004763284713744137,
"loss": 1.7725914306640624,
"step": 84000
},
{
"epoch": 24.04667046101309,
"grad_norm": 0.8268908262252808,
"learning_rate": 0.00047311572318961645,
"loss": 1.7661802978515626,
"step": 84500
},
{
"epoch": 24.188958451906657,
"grad_norm": 0.8119874000549316,
"learning_rate": 0.00046990297500481914,
"loss": 1.7655081787109375,
"step": 85000
},
{
"epoch": 24.33124644280023,
"grad_norm": 0.819488525390625,
"learning_rate": 0.0004666902268200219,
"loss": 1.76497509765625,
"step": 85500
},
{
"epoch": 24.473534433693796,
"grad_norm": 0.7789687514305115,
"learning_rate": 0.0004634774786352246,
"loss": 1.7610302734375,
"step": 86000
},
{
"epoch": 24.615822424587364,
"grad_norm": 0.7585675716400146,
"learning_rate": 0.0004602647304504273,
"loss": 1.7648392333984375,
"step": 86500
},
{
"epoch": 24.75811041548093,
"grad_norm": 0.7789969444274902,
"learning_rate": 0.00045705198226563004,
"loss": 1.7639283447265626,
"step": 87000
},
{
"epoch": 24.900398406374503,
"grad_norm": 0.802516758441925,
"learning_rate": 0.0004538392340808328,
"loss": 1.7633868408203126,
"step": 87500
},
{
"epoch": 25.04268639726807,
"grad_norm": 0.8010969161987305,
"learning_rate": 0.0004506264858960355,
"loss": 1.7580526123046876,
"step": 88000
},
{
"epoch": 25.18497438816164,
"grad_norm": 0.8012429475784302,
"learning_rate": 0.0004474137377112382,
"loss": 1.7505838623046874,
"step": 88500
},
{
"epoch": 25.327262379055206,
"grad_norm": 0.8224115967750549,
"learning_rate": 0.00044420098952644094,
"loss": 1.75313427734375,
"step": 89000
},
{
"epoch": 25.469550369948777,
"grad_norm": 0.8361182808876038,
"learning_rate": 0.0004409882413416437,
"loss": 1.75422998046875,
"step": 89500
},
{
"epoch": 25.611838360842345,
"grad_norm": 0.8170642852783203,
"learning_rate": 0.0004377754931568464,
"loss": 1.7506724853515625,
"step": 90000
},
{
"epoch": 25.754126351735913,
"grad_norm": 0.7613300085067749,
"learning_rate": 0.0004345627449720491,
"loss": 1.7525675048828124,
"step": 90500
},
{
"epoch": 25.89641434262948,
"grad_norm": 0.8419134616851807,
"learning_rate": 0.00043134999678725184,
"loss": 1.75163330078125,
"step": 91000
},
{
"epoch": 26.03870233352305,
"grad_norm": 0.7830471992492676,
"learning_rate": 0.0004281372486024546,
"loss": 1.7453779296875,
"step": 91500
},
{
"epoch": 26.18099032441662,
"grad_norm": 0.7521931529045105,
"learning_rate": 0.00042492450041765726,
"loss": 1.7388763427734375,
"step": 92000
},
{
"epoch": 26.323278315310187,
"grad_norm": 0.7945719957351685,
"learning_rate": 0.00042171175223286,
"loss": 1.74236572265625,
"step": 92500
},
{
"epoch": 26.465566306203755,
"grad_norm": 0.8893241882324219,
"learning_rate": 0.00041849900404806274,
"loss": 1.7380269775390624,
"step": 93000
},
{
"epoch": 26.607854297097326,
"grad_norm": 0.7732031345367432,
"learning_rate": 0.0004152862558632655,
"loss": 1.73862939453125,
"step": 93500
},
{
"epoch": 26.750142287990894,
"grad_norm": 0.8094469308853149,
"learning_rate": 0.00041207350767846816,
"loss": 1.7389964599609375,
"step": 94000
},
{
"epoch": 26.89243027888446,
"grad_norm": 0.7714164853096008,
"learning_rate": 0.0004088607594936709,
"loss": 1.7384852294921875,
"step": 94500
},
{
"epoch": 27.03471826977803,
"grad_norm": 0.8252856731414795,
"learning_rate": 0.00040564801130887364,
"loss": 1.7344244384765626,
"step": 95000
},
{
"epoch": 27.1770062606716,
"grad_norm": 0.7841668128967285,
"learning_rate": 0.0004024352631240764,
"loss": 1.7267880859375,
"step": 95500
},
{
"epoch": 27.319294251565168,
"grad_norm": 0.7918238639831543,
"learning_rate": 0.00039922251493927906,
"loss": 1.7263587646484375,
"step": 96000
},
{
"epoch": 27.461582242458736,
"grad_norm": 0.7744492888450623,
"learning_rate": 0.0003960097667544818,
"loss": 1.727615478515625,
"step": 96500
},
{
"epoch": 27.603870233352303,
"grad_norm": 0.833548367023468,
"learning_rate": 0.00039279701856968454,
"loss": 1.728632568359375,
"step": 97000
},
{
"epoch": 27.746158224245875,
"grad_norm": 0.7416006326675415,
"learning_rate": 0.0003895842703848872,
"loss": 1.7263828125,
"step": 97500
},
{
"epoch": 27.888446215139442,
"grad_norm": 0.8390225172042847,
"learning_rate": 0.00038637152220008996,
"loss": 1.7279501953125,
"step": 98000
},
{
"epoch": 28.03073420603301,
"grad_norm": 0.7987998723983765,
"learning_rate": 0.0003831587740152927,
"loss": 1.7208074951171874,
"step": 98500
},
{
"epoch": 28.173022196926578,
"grad_norm": 0.7214421629905701,
"learning_rate": 0.00037994602583049544,
"loss": 1.7175120849609375,
"step": 99000
},
{
"epoch": 28.31531018782015,
"grad_norm": 0.8112098574638367,
"learning_rate": 0.0003767332776456981,
"loss": 1.717369384765625,
"step": 99500
},
{
"epoch": 28.457598178713717,
"grad_norm": 0.825023353099823,
"learning_rate": 0.00037352052946090086,
"loss": 1.7164326171875,
"step": 100000
},
{
"epoch": 28.599886169607284,
"grad_norm": 0.7754949331283569,
"learning_rate": 0.0003703077812761036,
"loss": 1.7194827880859376,
"step": 100500
},
{
"epoch": 28.742174160500852,
"grad_norm": 0.815468430519104,
"learning_rate": 0.00036709503309130634,
"loss": 1.71721142578125,
"step": 101000
},
{
"epoch": 28.884462151394423,
"grad_norm": 0.7674705982208252,
"learning_rate": 0.000363882284906509,
"loss": 1.716550048828125,
"step": 101500
},
{
"epoch": 29.02675014228799,
"grad_norm": 0.8314065933227539,
"learning_rate": 0.00036066953672171176,
"loss": 1.70979833984375,
"step": 102000
},
{
"epoch": 29.16903813318156,
"grad_norm": 0.804320752620697,
"learning_rate": 0.0003574567885369145,
"loss": 1.7049903564453126,
"step": 102500
},
{
"epoch": 29.311326124075126,
"grad_norm": 0.7875852584838867,
"learning_rate": 0.0003542440403521172,
"loss": 1.70514208984375,
"step": 103000
},
{
"epoch": 29.453614114968698,
"grad_norm": 0.7865288257598877,
"learning_rate": 0.0003510312921673199,
"loss": 1.7045543212890626,
"step": 103500
},
{
"epoch": 29.595902105862265,
"grad_norm": 0.8601499199867249,
"learning_rate": 0.00034781854398252267,
"loss": 1.703396728515625,
"step": 104000
},
{
"epoch": 29.738190096755833,
"grad_norm": 0.8171347975730896,
"learning_rate": 0.0003446057957977254,
"loss": 1.7051766357421876,
"step": 104500
},
{
"epoch": 29.8804780876494,
"grad_norm": 0.8439942598342896,
"learning_rate": 0.0003413930476129281,
"loss": 1.702958740234375,
"step": 105000
},
{
"epoch": 30.022766078542972,
"grad_norm": 0.7871956825256348,
"learning_rate": 0.0003381802994281308,
"loss": 1.7031265869140626,
"step": 105500
},
{
"epoch": 30.16505406943654,
"grad_norm": 0.8173360228538513,
"learning_rate": 0.00033496755124333357,
"loss": 1.7017554931640626,
"step": 106000
},
{
"epoch": 30.307342060330107,
"grad_norm": 0.8125076293945312,
"learning_rate": 0.0003317548030585363,
"loss": 1.693796875,
"step": 106500
},
{
"epoch": 30.449630051223675,
"grad_norm": 0.8106098771095276,
"learning_rate": 0.000328542054873739,
"loss": 1.6901024169921874,
"step": 107000
},
{
"epoch": 30.591918042117246,
"grad_norm": 0.8278952240943909,
"learning_rate": 0.00032532930668894173,
"loss": 1.6938135986328124,
"step": 107500
},
{
"epoch": 30.734206033010814,
"grad_norm": 0.8017494082450867,
"learning_rate": 0.00032211655850414447,
"loss": 1.6923419189453126,
"step": 108000
},
{
"epoch": 30.87649402390438,
"grad_norm": 0.825312077999115,
"learning_rate": 0.00031890381031934715,
"loss": 1.6879945068359374,
"step": 108500
},
{
"epoch": 31.01878201479795,
"grad_norm": 0.8192269802093506,
"learning_rate": 0.0003156910621345499,
"loss": 1.69059716796875,
"step": 109000
},
{
"epoch": 31.16107000569152,
"grad_norm": 0.7704429030418396,
"learning_rate": 0.00031247831394975263,
"loss": 1.68362158203125,
"step": 109500
},
{
"epoch": 31.30335799658509,
"grad_norm": 0.781888484954834,
"learning_rate": 0.00030926556576495537,
"loss": 1.6819910888671874,
"step": 110000
},
{
"epoch": 31.445645987478656,
"grad_norm": 0.8602472543716431,
"learning_rate": 0.00030605281758015805,
"loss": 1.6821741943359374,
"step": 110500
},
{
"epoch": 31.587933978372224,
"grad_norm": 0.7637714743614197,
"learning_rate": 0.0003028400693953608,
"loss": 1.6784395751953125,
"step": 111000
},
{
"epoch": 31.730221969265795,
"grad_norm": 0.8043729662895203,
"learning_rate": 0.00029962732121056353,
"loss": 1.681421630859375,
"step": 111500
},
{
"epoch": 31.872509960159363,
"grad_norm": 0.8252000212669373,
"learning_rate": 0.00029641457302576627,
"loss": 1.681087646484375,
"step": 112000
},
{
"epoch": 32.014797951052934,
"grad_norm": 0.802941083908081,
"learning_rate": 0.00029320182484096895,
"loss": 1.6753712158203125,
"step": 112500
},
{
"epoch": 32.1570859419465,
"grad_norm": 0.814416766166687,
"learning_rate": 0.0002899890766561717,
"loss": 1.6733070068359375,
"step": 113000
},
{
"epoch": 32.29937393284007,
"grad_norm": 0.8030642867088318,
"learning_rate": 0.00028677632847137443,
"loss": 1.6685291748046875,
"step": 113500
},
{
"epoch": 32.44166192373363,
"grad_norm": 0.7646543383598328,
"learning_rate": 0.0002835635802865771,
"loss": 1.668842041015625,
"step": 114000
},
{
"epoch": 32.583949914627205,
"grad_norm": 0.770729660987854,
"learning_rate": 0.00028035083210177985,
"loss": 1.675781982421875,
"step": 114500
},
{
"epoch": 32.726237905520776,
"grad_norm": 0.7864305973052979,
"learning_rate": 0.0002771380839169826,
"loss": 1.6711048583984376,
"step": 115000
},
{
"epoch": 32.86852589641434,
"grad_norm": 0.7970394492149353,
"learning_rate": 0.00027392533573218533,
"loss": 1.6688397216796875,
"step": 115500
},
{
"epoch": 33.01081388730791,
"grad_norm": 0.7844989895820618,
"learning_rate": 0.000270712587547388,
"loss": 1.6707685546875,
"step": 116000
},
{
"epoch": 33.15310187820148,
"grad_norm": 0.7790973782539368,
"learning_rate": 0.00026749983936259075,
"loss": 1.663322509765625,
"step": 116500
},
{
"epoch": 33.29538986909505,
"grad_norm": 0.7947200536727905,
"learning_rate": 0.0002642870911777935,
"loss": 1.6612364501953125,
"step": 117000
},
{
"epoch": 33.43767785998862,
"grad_norm": 0.7712826728820801,
"learning_rate": 0.00026107434299299623,
"loss": 1.6583055419921875,
"step": 117500
},
{
"epoch": 33.57996585088218,
"grad_norm": 0.8012517690658569,
"learning_rate": 0.0002578615948081989,
"loss": 1.6572049560546875,
"step": 118000
},
{
"epoch": 33.72225384177575,
"grad_norm": 0.7550643682479858,
"learning_rate": 0.00025464884662340165,
"loss": 1.6559423828125,
"step": 118500
},
{
"epoch": 33.864541832669325,
"grad_norm": 0.857598602771759,
"learning_rate": 0.0002514360984386044,
"loss": 1.655523193359375,
"step": 119000
},
{
"epoch": 34.00682982356289,
"grad_norm": 0.7795873880386353,
"learning_rate": 0.0002482233502538071,
"loss": 1.6634495849609374,
"step": 119500
},
{
"epoch": 34.14911781445646,
"grad_norm": 0.8217109441757202,
"learning_rate": 0.0002450106020690098,
"loss": 1.648638427734375,
"step": 120000
},
{
"epoch": 34.29140580535003,
"grad_norm": 0.8032475709915161,
"learning_rate": 0.00024179785388421255,
"loss": 1.653551025390625,
"step": 120500
},
{
"epoch": 34.433693796243595,
"grad_norm": 0.7690367698669434,
"learning_rate": 0.00023858510569941526,
"loss": 1.65421923828125,
"step": 121000
},
{
"epoch": 34.57598178713717,
"grad_norm": 0.8075783848762512,
"learning_rate": 0.000235372357514618,
"loss": 1.64859423828125,
"step": 121500
},
{
"epoch": 34.71826977803073,
"grad_norm": 0.8210867047309875,
"learning_rate": 0.00023215960932982071,
"loss": 1.650385009765625,
"step": 122000
},
{
"epoch": 34.8605577689243,
"grad_norm": 0.763390839099884,
"learning_rate": 0.00022894686114502345,
"loss": 1.646854248046875,
"step": 122500
},
{
"epoch": 35.00284575981787,
"grad_norm": 0.7536216378211975,
"learning_rate": 0.0002257341129602262,
"loss": 1.6420960693359374,
"step": 123000
},
{
"epoch": 35.14513375071144,
"grad_norm": 0.8205796480178833,
"learning_rate": 0.00022252136477542893,
"loss": 1.6369521484375,
"step": 123500
},
{
"epoch": 35.28742174160501,
"grad_norm": 0.7954224348068237,
"learning_rate": 0.00021930861659063164,
"loss": 1.6364495849609375,
"step": 124000
},
{
"epoch": 35.42970973249858,
"grad_norm": 0.7882758975028992,
"learning_rate": 0.00021609586840583435,
"loss": 1.6430433349609375,
"step": 124500
},
{
"epoch": 35.571997723392144,
"grad_norm": 0.7636738419532776,
"learning_rate": 0.0002128831202210371,
"loss": 1.6426986083984374,
"step": 125000
},
{
"epoch": 35.714285714285715,
"grad_norm": 0.7501616477966309,
"learning_rate": 0.0002096703720362398,
"loss": 1.6390101318359376,
"step": 125500
},
{
"epoch": 35.85657370517928,
"grad_norm": 0.8382527232170105,
"learning_rate": 0.00020645762385144254,
"loss": 1.635440673828125,
"step": 126000
},
{
"epoch": 35.99886169607285,
"grad_norm": 0.7988425493240356,
"learning_rate": 0.00020324487566664525,
"loss": 1.638254150390625,
"step": 126500
},
{
"epoch": 36.14114968696642,
"grad_norm": 0.791145384311676,
"learning_rate": 0.000200032127481848,
"loss": 1.62477197265625,
"step": 127000
},
{
"epoch": 36.283437677859986,
"grad_norm": 0.7497517466545105,
"learning_rate": 0.0001968193792970507,
"loss": 1.632613037109375,
"step": 127500
},
{
"epoch": 36.42572566875356,
"grad_norm": 0.8118484020233154,
"learning_rate": 0.00019360663111225344,
"loss": 1.63175146484375,
"step": 128000
},
{
"epoch": 36.56801365964713,
"grad_norm": 0.8277371525764465,
"learning_rate": 0.00019039388292745615,
"loss": 1.6289234619140625,
"step": 128500
},
{
"epoch": 36.71030165054069,
"grad_norm": 0.7911479473114014,
"learning_rate": 0.0001871811347426589,
"loss": 1.6251759033203126,
"step": 129000
},
{
"epoch": 36.852589641434264,
"grad_norm": 0.7872730493545532,
"learning_rate": 0.0001839683865578616,
"loss": 1.6268780517578125,
"step": 129500
},
{
"epoch": 36.99487763232783,
"grad_norm": 0.7649409174919128,
"learning_rate": 0.00018075563837306432,
"loss": 1.6268775634765624,
"step": 130000
},
{
"epoch": 37.1371656232214,
"grad_norm": 0.8168938159942627,
"learning_rate": 0.00017754289018826706,
"loss": 1.61990966796875,
"step": 130500
},
{
"epoch": 37.27945361411497,
"grad_norm": 0.790477454662323,
"learning_rate": 0.00017433014200346977,
"loss": 1.618332275390625,
"step": 131000
},
{
"epoch": 37.421741605008535,
"grad_norm": 0.7928410768508911,
"learning_rate": 0.0001711173938186725,
"loss": 1.6161146240234374,
"step": 131500
},
{
"epoch": 37.564029595902106,
"grad_norm": 0.7818522453308105,
"learning_rate": 0.00016790464563387522,
"loss": 1.614980712890625,
"step": 132000
},
{
"epoch": 37.70631758679568,
"grad_norm": 0.7645919322967529,
"learning_rate": 0.00016469189744907796,
"loss": 1.6168826904296876,
"step": 132500
},
{
"epoch": 37.84860557768924,
"grad_norm": 0.7470947504043579,
"learning_rate": 0.00016147914926428067,
"loss": 1.6159892578125,
"step": 133000
},
{
"epoch": 37.99089356858281,
"grad_norm": 0.781106173992157,
"learning_rate": 0.0001582664010794834,
"loss": 1.6154237060546874,
"step": 133500
},
{
"epoch": 38.13318155947638,
"grad_norm": 0.8540311455726624,
"learning_rate": 0.00015505365289468612,
"loss": 1.6088677978515624,
"step": 134000
},
{
"epoch": 38.27546955036995,
"grad_norm": 0.8147649168968201,
"learning_rate": 0.00015184090470988886,
"loss": 1.608274658203125,
"step": 134500
},
{
"epoch": 38.41775754126352,
"grad_norm": 0.7976606488227844,
"learning_rate": 0.00014862815652509157,
"loss": 1.6079510498046874,
"step": 135000
},
{
"epoch": 38.56004553215708,
"grad_norm": 0.7808454036712646,
"learning_rate": 0.00014541540834029428,
"loss": 1.6069224853515625,
"step": 135500
},
{
"epoch": 38.702333523050655,
"grad_norm": 0.7801294922828674,
"learning_rate": 0.00014220266015549702,
"loss": 1.6083843994140625,
"step": 136000
},
{
"epoch": 38.844621513944226,
"grad_norm": 0.7674131989479065,
"learning_rate": 0.00013898991197069973,
"loss": 1.605281005859375,
"step": 136500
},
{
"epoch": 38.98690950483779,
"grad_norm": 0.7959563732147217,
"learning_rate": 0.00013577716378590247,
"loss": 1.6055640869140626,
"step": 137000
},
{
"epoch": 39.12919749573136,
"grad_norm": 0.8412840962409973,
"learning_rate": 0.00013256441560110518,
"loss": 1.6029959716796875,
"step": 137500
},
{
"epoch": 39.271485486624925,
"grad_norm": 0.8026737570762634,
"learning_rate": 0.00012935166741630792,
"loss": 1.600421875,
"step": 138000
},
{
"epoch": 39.4137734775185,
"grad_norm": 0.7729793190956116,
"learning_rate": 0.00012613891923151063,
"loss": 1.5968153076171876,
"step": 138500
},
{
"epoch": 39.55606146841207,
"grad_norm": 0.8124834299087524,
"learning_rate": 0.00012292617104671337,
"loss": 1.5938275146484375,
"step": 139000
},
{
"epoch": 39.69834945930563,
"grad_norm": 0.8108460307121277,
"learning_rate": 0.00011971342286191608,
"loss": 1.5965054931640625,
"step": 139500
},
{
"epoch": 39.8406374501992,
"grad_norm": 0.7830720543861389,
"learning_rate": 0.0001165006746771188,
"loss": 1.5987293701171874,
"step": 140000
},
{
"epoch": 39.982925441092775,
"grad_norm": 0.7493749856948853,
"learning_rate": 0.00011328792649232153,
"loss": 1.5955545654296874,
"step": 140500
},
{
"epoch": 40.12521343198634,
"grad_norm": 0.7606357932090759,
"learning_rate": 0.00011007517830752426,
"loss": 1.5879793701171876,
"step": 141000
},
{
"epoch": 40.26750142287991,
"grad_norm": 0.7698628306388855,
"learning_rate": 0.00010686243012272698,
"loss": 1.5900478515625,
"step": 141500
},
{
"epoch": 40.409789413773474,
"grad_norm": 0.8203583359718323,
"learning_rate": 0.0001036496819379297,
"loss": 1.5890828857421875,
"step": 142000
},
{
"epoch": 40.552077404667045,
"grad_norm": 0.7594188451766968,
"learning_rate": 0.00010043693375313243,
"loss": 1.5884957275390625,
"step": 142500
},
{
"epoch": 40.69436539556062,
"grad_norm": 0.8142854571342468,
"learning_rate": 9.722418556833516e-05,
"loss": 1.585271728515625,
"step": 143000
},
{
"epoch": 40.83665338645418,
"grad_norm": 0.867124080657959,
"learning_rate": 9.401143738353787e-05,
"loss": 1.584123046875,
"step": 143500
},
{
"epoch": 40.97894137734775,
"grad_norm": 0.7953840494155884,
"learning_rate": 9.07986891987406e-05,
"loss": 1.5837237548828125,
"step": 144000
},
{
"epoch": 41.12122936824132,
"grad_norm": 0.782312273979187,
"learning_rate": 8.758594101394333e-05,
"loss": 1.58353369140625,
"step": 144500
},
{
"epoch": 41.26351735913489,
"grad_norm": 0.7618634700775146,
"learning_rate": 8.437319282914606e-05,
"loss": 1.58224267578125,
"step": 145000
},
{
"epoch": 41.40580535002846,
"grad_norm": 0.8059329986572266,
"learning_rate": 8.116044464434878e-05,
"loss": 1.5730921630859376,
"step": 145500
},
{
"epoch": 41.54809334092202,
"grad_norm": 0.756024181842804,
"learning_rate": 7.79476964595515e-05,
"loss": 1.57981982421875,
"step": 146000
},
{
"epoch": 41.690381331815594,
"grad_norm": 0.8015180826187134,
"learning_rate": 7.473494827475423e-05,
"loss": 1.578093505859375,
"step": 146500
},
{
"epoch": 41.832669322709165,
"grad_norm": 0.7689457535743713,
"learning_rate": 7.152220008995696e-05,
"loss": 1.575974365234375,
"step": 147000
},
{
"epoch": 41.97495731360273,
"grad_norm": 0.7837737798690796,
"learning_rate": 6.830945190515968e-05,
"loss": 1.57426171875,
"step": 147500
},
{
"epoch": 42.1172453044963,
"grad_norm": 0.758391797542572,
"learning_rate": 6.509670372036241e-05,
"loss": 1.5727047119140625,
"step": 148000
},
{
"epoch": 42.25953329538987,
"grad_norm": 0.7854096293449402,
"learning_rate": 6.188395553556513e-05,
"loss": 1.56796923828125,
"step": 148500
},
{
"epoch": 42.401821286283436,
"grad_norm": 0.7863969206809998,
"learning_rate": 5.867120735076785e-05,
"loss": 1.571415771484375,
"step": 149000
},
{
"epoch": 42.54410927717701,
"grad_norm": 0.7552393674850464,
"learning_rate": 5.5458459165970575e-05,
"loss": 1.5717161865234375,
"step": 149500
},
{
"epoch": 42.68639726807057,
"grad_norm": 0.7907793521881104,
"learning_rate": 5.22457109811733e-05,
"loss": 1.565153564453125,
"step": 150000
},
{
"epoch": 42.82868525896414,
"grad_norm": 0.8196442127227783,
"learning_rate": 4.903296279637602e-05,
"loss": 1.5695428466796875,
"step": 150500
},
{
"epoch": 42.970973249857714,
"grad_norm": 0.7759655714035034,
"learning_rate": 4.5820214611578744e-05,
"loss": 1.5666607666015624,
"step": 151000
},
{
"epoch": 43.11326124075128,
"grad_norm": 0.7889202237129211,
"learning_rate": 4.260746642678147e-05,
"loss": 1.55801416015625,
"step": 151500
},
{
"epoch": 43.25554923164485,
"grad_norm": 0.7735246419906616,
"learning_rate": 3.9394718241984194e-05,
"loss": 1.563793212890625,
"step": 152000
},
{
"epoch": 43.39783722253842,
"grad_norm": 0.783096432685852,
"learning_rate": 3.618197005718692e-05,
"loss": 1.564925048828125,
"step": 152500
},
{
"epoch": 43.540125213431985,
"grad_norm": 0.8447062373161316,
"learning_rate": 3.296922187238964e-05,
"loss": 1.561072509765625,
"step": 153000
},
{
"epoch": 43.682413204325556,
"grad_norm": 0.8156583905220032,
"learning_rate": 2.9756473687592366e-05,
"loss": 1.5635389404296876,
"step": 153500
},
{
"epoch": 43.82470119521912,
"grad_norm": 0.8073771595954895,
"learning_rate": 2.654372550279509e-05,
"loss": 1.55376953125,
"step": 154000
},
{
"epoch": 43.96698918611269,
"grad_norm": 0.8184100389480591,
"learning_rate": 2.3330977317997816e-05,
"loss": 1.5572464599609375,
"step": 154500
},
{
"epoch": 44.10927717700626,
"grad_norm": 0.7976749539375305,
"learning_rate": 2.0118229133200538e-05,
"loss": 1.556713623046875,
"step": 155000
},
{
"epoch": 44.25156516789983,
"grad_norm": 0.7821652889251709,
"learning_rate": 1.6905480948403263e-05,
"loss": 1.5601466064453124,
"step": 155500
},
{
"epoch": 44.3938531587934,
"grad_norm": 0.7849637269973755,
"learning_rate": 1.3692732763605988e-05,
"loss": 1.555856201171875,
"step": 156000
},
{
"epoch": 44.53614114968697,
"grad_norm": 0.8034117221832275,
"learning_rate": 1.0479984578808712e-05,
"loss": 1.5571749267578125,
"step": 156500
},
{
"epoch": 44.67842914058053,
"grad_norm": 0.7568624019622803,
"learning_rate": 7.267236394011437e-06,
"loss": 1.553765625,
"step": 157000
},
{
"epoch": 44.820717131474105,
"grad_norm": 0.7983436584472656,
"learning_rate": 4.054488209214162e-06,
"loss": 1.55248681640625,
"step": 157500
},
{
"epoch": 44.96300512236767,
"grad_norm": 0.8003237247467041,
"learning_rate": 8.417400244168862e-07,
"loss": 1.554723388671875,
"step": 158000
},
{
"epoch": 45.0,
"step": 158130,
"total_flos": 3.365420849747251e+17,
"train_loss": 1.929684388318656,
"train_runtime": 25758.7104,
"train_samples_per_second": 392.825,
"train_steps_per_second": 6.139
}
],
"logging_steps": 500,
"max_steps": 158130,
"num_input_tokens_seen": 0,
"num_train_epochs": 45,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.365420849747251e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}