NuclearDispersionSystem/ant-design-vue-jeecg/node_modules/china-area-data/v5/fetch.js
2023-09-14 14:47:11 +08:00

225 lines
5.0 KiB
JavaScript

const request = require('request-promise')
const iconv = require('iconv-lite')
const cheerio = require('cheerio')
const md5 = require('md5')
const path = require('path')
const fs = require('fs')
const retry = require('async-retry')
const special = require('./special')
/**
 * Pauses for a given delay.
 *
 * Logs the requested delay, then hands back a promise that settles (with no
 * value) once the timer fires.
 *
 * @param {number} time - Delay passed to setTimeout (milliseconds).
 * @returns {Promise<void>} Resolves after the timer elapses.
 */
function sleep(time) {
  console.log(`sleep: ${time}`)
  return new Promise(wake => setTimeout(wake, time))
}
/**
 * Fetches a page as decoded text, backed by an on-disk cache in ./temp.
 *
 * The cache file is named after the md5 of the URL. On a cache hit the file
 * is read back as UTF-8. On a miss the page is downloaded as raw bytes,
 * decoded as gb2312 and written to the cache.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {boolean} [returnMeta=false] - When true, resolve with
 *   `{ isCache, content }` instead of the bare content string.
 * @returns {Promise<string|{isCache: boolean, content: string}>} The page
 *   text. Resolves with empty content (and logs the error) when every
 *   download attempt failed.
 */
async function getPage(url, returnMeta = false) {
  const cacheDir = './temp'
  const file = `${cacheDir}/${md5(url)}.html`
  let content = ''
  let isCache = false

  if (fs.existsSync(file)) {
    content = fs.readFileSync(file, 'utf-8')
    isCache = true
  } else {
    let downloaded = false
    try {
      // Errors must escape this callback: async-retry only schedules another
      // attempt when the callback rejects. Swallowing them here would make
      // the first failure look like a success.
      content = await retry(async () => {
        const rs = await request(url, {
          resolveWithFullResponse: true,
          // null encoding keeps the body undecoded so iconv can handle gb2312
          encoding: null,
        })
        return iconv.decode(rs.body, 'gb2312')
      }, {
        retries: 50,
        minTimeout: 5000,
        onRetry: function () {
          console.log('retry')
        }
      })
      downloaded = true
    } catch (e) {
      // All attempts failed. Keep resolving with empty content so a single
      // dead page does not abort the whole crawl, but make the failure visible.
      console.error(`getPage: giving up on ${url}`, e)
    }

    if (downloaded) {
      // Caching is best-effort and kept outside the retried section so that a
      // disk problem never triggers a re-download.
      try {
        if (!fs.existsSync(cacheDir)) {
          fs.mkdirSync(cacheDir)
        }
        fs.writeFileSync(file, content)
      } catch (e) {
        console.error(`getPage: could not cache ${url}`, e)
      }
    }
  }

  if (returnMeta) {
    return {
      isCache,
      content
    }
  }
  return content
}
;
/**
 * Script entry point: crawls the 2019 division-code pages and writes the
 * collected id -> name tables to disk.
 *
 * `rs` maps a parent id to an object of `{ childId: name }`; the root key is
 * '86' and holds the provinces. Pages are fetched one at a time through
 * getPage. When crawling finishes, the result (merged with `special`) is
 * written once to ../data.json and ./data.json, and the ids of cities that
 * ended up with no children go to ./only_2_level_city_id.json.
 *
 * Any error is logged and reported through a non-zero exit code.
 */
(async () => {
  const baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
  // A name cell is one whose text contains no digits (code cells do).
  const hasDigits = text => /\d+/.test(text)

  try {
    const cityIds = []
    const rs = {
      '86': {}
    }

    // Provinces
    const $index = cheerio.load(await getPage(`${baseUrl}index.html`))
    const provinceUrls = []
    $index('.provincetr td a').each((index, el) => {
      const href = $index(el).attr('href')
      // Province id: link file name (without extension) padded with '0000'.
      rs['86'][href.split('.')[0] + '0000'] = $index(el).text()
      provinceUrls.push(`${baseUrl}${href}`)
    })
    // Regions added by hand rather than read from the index page.
    const specialProvinces = {
      "710000": "台湾省",
      "810000": "香港特别行政区",
      "820000": "澳门特别行政区"
    }
    Object.assign(rs['86'], specialProvinces)

    // Cities
    for (const provinceUrl of provinceUrls) {
      const $province = cheerio.load(await getPage(provinceUrl))
      // A Set, because the same page can be linked more than once per row and
      // processing it again would only repeat identical assignments.
      const countyUrls = new Set()
      $province('.citytr td a').each((index, el) => {
        const href = $province(el).attr('href')
        const name = $province(el).text()
        // Only links whose text has no digits name a city.
        if (!hasDigits(name)) {
          // presumably href looks like '<2-digit province>/<city code>.html',
          // so slice(3) drops the directory prefix — TODO confirm
          const id = href.split('.')[0].slice(3) + '00'
          const parentId = id.slice(0, 2) + '0000'
          if (!rs[parentId]) {
            rs[parentId] = {}
          }
          cityIds.push(id)
          rs[parentId][id] = name
        }
        countyUrls.add(`${baseUrl}${href}`)
      })

      // Counties (and towns listed directly on a city page)
      for (const countyUrl of countyUrls) {
        const $city = cheerio.load(await getPage(countyUrl))
        const cells = $city('.countytr td, .towntr td')
        // Links on this page are relative to the 8th '/'-separated URL part.
        const linkPrefix = countyUrl.split('/')[7] + '/'
        for (let index = 0; index < cells.length; index++) {
          const { data } = processLinks($city, cells, index, linkPrefix)
          if (!data.id) {
            continue
          }
          console.log(data)
          if (data.isLink) {
            // href 'dir/code.html' -> 'code'
            data.id = data.id.split('.')[0].split('/')[1]
          } else {
            // NOTE(review): this strips every trailing zero, which would also
            // shorten a code whose significant digits end in 0 — confirm.
            data.id = data.id.replace(/0+$/g, '')
          }
          const parentId = data.id.slice(0, 4) + '00'
          if (!rs[parentId]) {
            rs[parentId] = {}
          }
          rs[parentId][data.id] = data.name
        }
        // Town-level (street) pages are intentionally not crawled.
      }
    }

    // All pages have been awaited by now, so write the result exactly once.
    const merged = JSON.stringify(Object.assign({}, rs, special), null, 2)
    fs.writeFileSync('../data.json', merged)
    fs.writeFileSync('./data.json', merged)
    const _2levelCityIds = cityIds.filter(c => !Object.keys(rs[c] || {}).length)
    fs.writeFileSync('./only_2_level_city_id.json', JSON.stringify(
      _2levelCityIds, null, 2))
    console.log('done')
  } catch (e) {
    console.log(e)
    process.exitCode = 1
  }
})()
/**
 * Inspects one table cell and reports the entry and follow-up URL it yields.
 *
 * @param {Function} $ - Cheerio-style wrapper; `$(node)` must offer
 *   `find`, `attr` and `text`.
 * @param {ArrayLike} list - The cells being walked.
 * @param {number} index - Position of the cell to inspect.
 * @param {string} [linkPrefix=''] - Path segment inserted before the cell's
 *   href when building the absolute URL.
 * @returns {{data: Object, url: (string|undefined)}} `url` is set when the
 *   cell's first anchor has an href. `data` is `{}` for cells whose text
 *   contains a digit; otherwise `{ isLink, id, name }`, where `id` is the
 *   anchor href for linked cells and the previous cell's text for plain ones.
 */
function processLinks ($, list, index, linkPrefix = '') {
  const cell = list[index]
  const anchors = $(cell).find('a')
  const href = $(anchors[0]).attr('href')
  const url = href
    ? `http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/${linkPrefix}${href}`
    : undefined

  const name = $(cell).text()
  let data = {}
  // Cells containing digits are code cells and produce no entry themselves.
  if (!/\d+/.test(name)) {
    const isLink = anchors.length > 0
    data = {
      isLink,
      // Without a link the code has to come from the cell just before.
      id: isLink ? href : $(list[index - 1]).text(),
      name
    }
  }

  return { data, url }
}