P5282,30pts
#include<bits/stdc++.h>
namespace Limie{
#define x first
#define y second
using namespace std;
typedef long long LL;
typedef pair<int,int> PII;
typedef unsigned long long ULL;
typedef long double LD;
}
using namespace Limie;
int n,p,B;
struct FastMod{
typedef __uint128_t uint;
LL m,B;
FastMod(const LL&m=2):m(m),B(((uint)(1)<<64)/m){}
friend inline LL operator%(const LL&a,const FastMod&mod)
{LL r=a-mod.m*((uint)(mod.B)*a>>64);return r>=mod.m?r-mod.m:r;}
}mod(2);
const int N=17;
int infact[1<<N];
const LD PI=acosl(-1);
struct Complex{
LD a,b;
inline Complex operator+(const Complex &w)
{return Complex{a+w.a,b+w.b};}
inline Complex operator-(const Complex &w)
{return Complex{a-w.a,b-w.b};}
inline Complex operator*(const Complex &w)
{return Complex{a*w.a-b*w.b,a*w.b+b*w.a};}
}w[1<<N];
void fft(vector<Complex> &a,const int f)
{
int n=a.size();Complex g,h;if(f==1){
for(int len=n>>1;len;len>>=1)
for(int j=0,x=0;j<n;j+=(len<<1),++x)
for(int i=j;i<j+len;++i)
g=a[i],h=a[i+len]*w[x],a[i]=g+h,a[i+len]=g-h;
}else{
for(int len=1;len<n;len<<=1)
for(int j=0,x=0;j<n;j+=(len<<1),++x)
for(int i=j;i<j+len;++i)
g=a[i],h=a[i+len],a[i]=g+h,a[i+len]=(g-h)*w[x];
reverse(a.begin()+1,a.end());
}
}
int qmi(int a,int b=p-2)
{
int ans=1;
while(b){
if(b&1)ans=(LL)ans*a%mod;
a=(LL)a*a%mod,b>>=1;
}return ans;
}
Complex qmi(Complex a,int b)
{
Complex ans={1,0};
while(b){
if(b&1)ans=ans*a;
a=a*a,b>>=1;
}return ans;
}
inline int add(int a,int b){return ((a+=b)>=p)?a-p:a;}
void init()
{
w[0]={1,0};Complex f;
const Complex x={cos(PI/(1<<N-1)),sin(PI/(1<<N-1))};
for(int i=0;i+1<N;i++){
f=qmi(x,1<<N-i-2);
for(int j=(1<<i);j<(1<<i+1);j++)w[j]=f*w[j^(1<<i)];
}
}
void init(int n,int s=1)
{
for(int i=1;i<=n;i++)s=(LL)s*i%mod;infact[n]=qmi(s);
for(int i=n;i;i--)infact[i-1]=(LL)infact[i]*i%mod;
}
struct poly:vector<int>{
friend inline void formalize(poly &a)
{int n=1;while(n<a.size())n<<=1;a.resize(n);return;}
friend void operator*=(poly &a,poly b)
{
static vector<Complex> x,y,z;
int len=a.size()+b.size()-1,n=1;
while(n<len)n<<=1;x.clear(),y.clear(),z.clear();
x.resize(n),y.resize(n),z.resize(n);
for(int i=0;i<a.size();i++)x[i].a=a[i]&32767,y[i].a=a[i]>>15;
for(int i=0;i<b.size();i++)z[i]={b[i]&32767,b[i]>>15};
fft(x,1),fft(y,1),fft(z,1);
for(int i=0;i<n;i++)x[i]=x[i]*z[i],y[i]=y[i]*z[i];
fft(x,-1),fft(y,-1);a.resize(len);
for(int i=0;i<len;i++){
LL a1=(LL)(x[i].a/n+.5)%mod;
LL a2=(LL)((LL)(x[i].b/n+.5)%mod+(LL)(y[i].a/n+.5)%mod)<<15;
LL a3=((LL)(y[i].b/n+.5)%mod)<<30;
a[i]=(a1+a2+a3)%mod;
}
}
};
poly ans,g;
void calc(int m)
{
int n=ans.size()-1,i,s=1;
g.resize(n+1);poly t;t.resize(n*2+1);
for(i=0;i<=n;i++){
g[i]=(LL)ans[i]*infact[i]%mod*infact[n-i]%mod;
if((n-i)&1)g[i]=p-g[i];
}for(i=0;i<=n*2;i++)s=(LL)s*(m-n+i)%mod;s=qmi(s);
for(i=0;i<=n*2;i++)t[i]=s;
s=1;for(i=0;i<=n*2;i++)t[i]=(LL)t[i]*s%mod,s=(LL)s*(m-n+i)%mod;
s=1;for(i=n*2;i>=0;i--)t[i]=(LL)t[i]*s%mod,s=(LL)s*(m-n+i)%mod;
g*=t;s=1;for(i=m-n;i<=m;i++)s=(LL)s*i%mod;
for(i=0;i<=n;i++)g[i]=(LL)g[n+i]*s%mod,s=(LL)s*(m+i+1)%mod*t[i]%mod;
g.resize(n+1);
}
void get(int n)
{
if(n==1){
ans.resize(2);ans[0]=1,ans[1]=B+1;
return;
}int m=n>>1,i;
get(m);
calc(m+1);
ans.resize(n+1);
for(i=m+1;i<=n;i++)ans[i]=g[i-m-1];
calc((LL)m*qmi(B)%mod);
for(i=0;i<=n;i++)ans[i]=(LL)ans[i]*g[i]%mod;
if(n&1)
for(i=0;i<=n;i++)ans[i]=(LL)ans[i]*(i*B+n)%mod;
}
void solve()
{
cin>>n>>p,mod=FastMod(p);
B=sqrt(n-1)+1;
init(B);
get(B);
int k=n/B,s=1;
for(int i=0;i<k;i++)s=(LL)s*ans[i]%mod;
for(int i=k*B+1;i<=n;i++)s=((LL)s*i)%mod;
cout<<((LL)s+p)%mod<<'\n';
}
signed main()
{
int _;cin>>_;
int start=clock();
init();
while(_--)solve();
int end=clock();
cerr<<(end-start)*1000.0/CLOCKS_PER_SEC;
}
/*
4
16777216 998244353
72267859 998244353
2333333 19260817
1919810 2147481811
*/