"""将source post对应的posts划分成不定长的post batch序列 Params: weibo_id (str), source post对应的id,用于读取对应数据 N (int), 时间序列的基准time steps个数 Returns: output (list), interval list, 每一个interval包含一定数量的post index """ # 不同时间间隔内的post数量不必相同) path = "Weibo" + os_sep + "{}.json".format(weibo_id) data = load_rawdata(data_path + path) # 基于weibo id加载包含转帖文本及时间戳的原始数据 tweet_list = [(idx, tweet["t"]) for idx, tweet in enumerate(data)] total_timespan = tweet_list[-1][1] - tweet_list[0][1] # L(i) time_interval = total_timespan / N # l k = 0 pre_max_inters = [] # U_(k_1) while True: # Spliting series by the current time interval k += 1 interval_num = int(total_timespan / time_interval) output, inter_index = ConstructSeries(tweet_list, interval_num, time_interval) max_inters = GetContinueInterval(inter_index) # maximum continue interval index if len(pre_max_inters) < len(max_inters) < N: time_interval = int(time_interval * 0.5) # Shorten the intervals pre_max_inters = max_inters if time_interval == 0: output = output[max_inters[0]:max_inters[-1] + 1] break else: output = output[max_inters[0]:max_inters[-1] + 1] break return output
原创文章,作者:506227337,如若转载,请注明出处:https://blog.ytso.com/273172.html